Elron committed on
Commit
0a1b314
1 Parent(s): b462f85

Upload folder using huggingface_hub

artifact.py CHANGED
@@ -5,7 +5,7 @@ import os
5
  import pkgutil
6
  from abc import abstractmethod
7
  from copy import deepcopy
8
- from typing import Dict, List, Optional, Union, final
9
 
10
  from .dataclass import (
11
  AbstractField,
@@ -129,6 +129,10 @@ class Artifact(Dataclass):
129
  )
130
  __id__: str = InternalField(default=None, required=False, also_positional=False)
131
132
  @classmethod
133
  def is_artifact_dict(cls, d):
134
  return isinstance(d, dict) and "type" in d
@@ -226,6 +230,11 @@ class Artifact(Dataclass):
226
  new_artifact.__id__ = artifact_identifier
227
  return new_artifact
228
229
  def prepare(self):
230
  pass
231
 
@@ -236,6 +245,20 @@ class Artifact(Dataclass):
236
  def __pre_init__(self, **kwargs):
237
  self._init_dict = get_raw(kwargs)
238
239
  @final
240
  def __post_init__(self):
241
  self.type = self.register_class(self.__class__)
@@ -248,6 +271,7 @@ class Artifact(Dataclass):
248
  value = map_values_in_place(value, maybe_recover_artifact)
249
  setattr(self, field.name, value)
250
 
 
251
  if not settings.skip_artifacts_prepare_and_verify:
252
  self.prepare()
253
  self.verify()
@@ -259,6 +283,76 @@ class Artifact(Dataclass):
259
  data = self.to_dict()
260
  save_json(path, data)
261
 
262
 
263
  def get_raw(obj):
264
  if isinstance(obj, Artifact):
@@ -367,3 +461,53 @@ def register_all_artifacts(path):
367
  # Make sure the class is a subclass of Artifact (but not Artifact itself)
368
  if issubclass(obj, Artifact) and obj is not Artifact:
369
  logger.info(obj)
 
5
  import pkgutil
6
  from abc import abstractmethod
7
  from copy import deepcopy
8
+ from typing import Any, Dict, List, Optional, Union, final
9
 
10
  from .dataclass import (
11
  AbstractField,
 
129
  )
130
  __id__: str = InternalField(default=None, required=False, also_positional=False)
131
 
132
+ data_classification_policy: List[str] = NonPositionalField(
133
+ default=None, required=False, also_positional=False
134
+ )
135
+
136
  @classmethod
137
  def is_artifact_dict(cls, d):
138
  return isinstance(d, dict) and "type" in d
 
230
  new_artifact.__id__ = artifact_identifier
231
  return new_artifact
232
 
233
+ def get_pretty_print_name(self):
234
+ if self.__id__ is not None:
235
+ return self.__id__
236
+ return self.__class__.__name__
237
+
238
  def prepare(self):
239
  pass
240
 
 
245
  def __pre_init__(self, **kwargs):
246
  self._init_dict = get_raw(kwargs)
247
 
248
+ @final
249
+ def verify_data_classification_policy(self):
250
+ if self.data_classification_policy is not None:
251
+ if not isinstance(self.data_classification_policy, list) or not all(
252
+ isinstance(data_classification, str)
253
+ for data_classification in self.data_classification_policy
254
+ ):
255
+ raise ValueError(
256
+ f"The 'data_classification_policy' of {self.get_pretty_print_name()} "
257
+ f"must be either None - in case when no policy applies - or a list of "
258
+ f"strings, for example: ['public']. However, '{self.data_classification_policy}' "
259
+ f"of type {type(self.data_classification_policy)} was provided instead."
260
+ )
261
+
262
  @final
263
  def __post_init__(self):
264
  self.type = self.register_class(self.__class__)
 
271
  value = map_values_in_place(value, maybe_recover_artifact)
272
  setattr(self, field.name, value)
273
 
274
+ self.verify_data_classification_policy()
275
  if not settings.skip_artifacts_prepare_and_verify:
276
  self.prepare()
277
  self.verify()
 
283
  data = self.to_dict()
284
  save_json(path, data)
285
 
286
+ def verify_instance(
287
+ self, instance: Dict[str, Any], name: Optional[str] = None
288
+ ) -> Dict[str, Any]:
289
+ """Checks if data classifications of an artifact and instance are compatible.
290
+
291
+ Raises an error if an artifact's data classification policy does not include that of
292
+ processed data. The purpose is to ensure that any sensitive data is handled in a
293
+ proper way (for example when sending it to some external services).
294
+
295
+ Args:
296
+ instance (Dict[str, Any]): data which should contain its allowed data
297
+ classification policies under key 'data_classification_policy'.
298
+ name (Optional[str]): name of artifact which should be used to retrieve
299
+ data classification from env. If not specified, then either __id__ or
300
+ __class__.__name__ is used instead, respectively.
301
+
302
+ Returns:
303
+ Dict[str, Any]: unchanged instance.
304
+
305
+ Examples:
306
+ instance = {"x": "some_text", "data_classification_policy": ["pii"]}
307
+
308
+ # Will raise an error as "pii" is not included in the policy
309
+ metric = Accuracy(data_classification_policy=["public"])
310
+ metric.verify_instance(instance)
311
+
312
+ # Will not raise an error
313
+ template = SpanLabelingTemplate(data_classification_policy=["pii", "proprietary"])
314
+ template.verify_instance(instance)
315
+
316
+ # Will not raise an error since the policy was specified in environment variable:
317
+ UNITXT_DATA_CLASSIFICATION_POLICY = json.dumps({"metrics.accuracy": ["pii"]})
318
+ metric = fetch_artifact("metrics.accuracy")
319
+ metric.verify_instance(instance)
320
+ """
321
+ name = name or self.get_pretty_print_name()
322
+ data_classification_policy = get_artifacts_data_classification(name)
323
+ if not data_classification_policy:
324
+ data_classification_policy = self.data_classification_policy
325
+
326
+ if not data_classification_policy:
327
+ return instance
328
+
329
+ instance_data_classification = instance.get("data_classification_policy")
330
+ if not instance_data_classification:
331
+ get_logger().warning(
332
+ f"The data does not provide information if it can be used by "
333
+ f"'{name}' with the following data classification policy "
334
+ f"'{data_classification_policy}'. This may lead to sending of undesired "
335
+ f"data to an external service. Set the 'data_classification_policy' "
336
+ f"of the data to ensure proper handling of sensitive information."
337
+ )
338
+ return instance
339
+
340
+ if not any(
341
+ data_classification in data_classification_policy
342
+ for data_classification in instance_data_classification
343
+ ):
344
+ raise ValueError(
345
+ f"The instance '{instance}' has the following data classification policy "
346
+ f"'{instance_data_classification}', however, the artifact '{name}' "
347
+ f"is only configured to support the data with classification "
348
+ f"'{data_classification_policy}'. To enable this either change "
349
+ f"the 'data_classification_policy' attribute of the artifact, "
350
+ f"or modify the environment variable "
351
+ f"'UNITXT_DATA_CLASSIFICATION_POLICY' accordingly."
352
+ )
353
+
354
+ return instance
355
+
356
 
357
  def get_raw(obj):
358
  if isinstance(obj, Artifact):
 
461
  # Make sure the class is a subclass of Artifact (but not Artifact itself)
462
  if issubclass(obj, Artifact) and obj is not Artifact:
463
  logger.info(obj)
464
+
465
+
466
+ def get_artifacts_data_classification(artifact: str) -> Optional[List[str]]:
467
+ """Loads given artifact's data classification policy from an environment variable.
468
+
469
+ Args:
470
+ artifact (str): Name of the artifact for which the data classification policy
471
+ should be retrieved. For example "metrics.accuracy".
472
+
473
+ Returns:
474
+ Optional[List[str]] - Data classification policies for the specified artifact
475
+ if they were found, or None otherwise.
476
+ """
477
+ data_classification = settings.data_classification_policy
478
+ if data_classification is None:
479
+ return None
480
+
481
+ error_msg = (
482
+ f"If specified, the value of 'UNITXT_DATA_CLASSIFICATION_POLICY' "
483
+ f"should be a valid json dictionary. Got '{data_classification}' "
484
+ f"instead."
485
+ )
486
+
487
+ try:
488
+ data_classification = json.loads(data_classification)
489
+ except json.decoder.JSONDecodeError as e:
490
+ raise RuntimeError(error_msg) from e
491
+
492
+ if not isinstance(data_classification, dict):
493
+ raise RuntimeError(error_msg)
494
+
495
+ for artifact_name, artifact_data_classifications in data_classification.items():
496
+ if (
497
+ not isinstance(artifact_name, str)
498
+ or not isinstance(artifact_data_classifications, list)
499
+ or not all(
500
+ isinstance(artifact_data_classification, str)
501
+ for artifact_data_classification in artifact_data_classifications
502
+ )
503
+ ):
504
+ raise RuntimeError(
505
+ "'UNITXT_DATA_CLASSIFICATION_POLICY' should be of type "
506
+ "'Dict[str, List[str]]', where an artifact's name is a key, and a "
507
+ "value is a list of data classifications used by that artifact."
508
+ )
509
+
510
+ if artifact not in data_classification.keys():
511
+ return None
512
+
513
+ return data_classification.get(artifact)
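
Taken together, the artifact.py additions give an artifact's policy two possible sources: its own `data_classification_policy` attribute, or an override read from the `UNITXT_DATA_CLASSIFICATION_POLICY` environment variable by `get_artifacts_data_classification`. The standalone sketch below mirrors that precedence and the compatibility check performed by `verify_instance`; it is illustrative only and does not call the unitxt API.

.. code-block:: python

    import json
    import os
    from typing import Dict, List, Optional

    # Illustrative re-implementation of the precedence described above (not the unitxt
    # functions themselves): an env-var entry for the artifact name overrides the
    # artifact's own attribute.
    def resolve_policy(artifact_name: str, attribute_policy: Optional[List[str]]) -> Optional[List[str]]:
        raw = os.environ.get("UNITXT_DATA_CLASSIFICATION_POLICY")
        if raw:
            env_mapping: Dict[str, List[str]] = json.loads(raw)
            if artifact_name in env_mapping:
                return env_mapping[artifact_name]
        return attribute_policy

    def is_instance_allowed(instance: dict, policy: Optional[List[str]]) -> bool:
        if not policy:
            return True  # artifact defines no policy -> nothing to enforce
        instance_policy = instance.get("data_classification_policy")
        if not instance_policy:
            return True  # unlabeled data only triggers a warning in verify_instance()
        return any(classification in policy for classification in instance_policy)

    os.environ["UNITXT_DATA_CLASSIFICATION_POLICY"] = json.dumps({"metrics.accuracy": ["pii"]})
    instance = {"x": "some_text", "data_classification_policy": ["pii"]}
    policy = resolve_policy("metrics.accuracy", attribute_policy=["public"])
    print(is_instance_allowed(instance, policy))  # True: the env var overrides ["public"]
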
blocks.py CHANGED
@@ -31,7 +31,7 @@ from .struct_data_operators import (
31
  TruncateTableCells,
32
  TruncateTableRows,
33
  )
34
- from .task import Task
35
  from .templates import (
36
  InputOutputTemplate,
37
  MultiLabelTemplate,
 
31
  TruncateTableCells,
32
  TruncateTableRows,
33
  )
34
+ from .task import FormTask, Task
35
  from .templates import (
36
  InputOutputTemplate,
37
  MultiLabelTemplate,
collections_operators.py CHANGED
@@ -1,7 +1,7 @@
1
  from copy import deepcopy
2
  from typing import Any, Generator, List, Optional
3
 
4
- from .operators import FieldOperator, SingleStreamOperator
5
  from .stream import Stream
6
 
7
 
@@ -58,7 +58,7 @@ class Get(FieldOperator):
58
  return collection[self.item]
59
 
60
 
61
- class DuplicateByList(SingleStreamOperator):
62
  field: str
63
  to_field: Optional[str] = None
64
  use_deep_copy: bool = False
@@ -80,7 +80,7 @@ class DuplicateByList(SingleStreamOperator):
80
  yield instance_copy
81
 
82
 
83
- class DuplicateBySubLists(SingleStreamOperator):
84
  field: str
85
  to_field: Optional[str] = None
86
  use_deep_copy: bool = False
 
1
  from copy import deepcopy
2
  from typing import Any, Generator, List, Optional
3
 
4
+ from .operators import FieldOperator, StreamOperator
5
  from .stream import Stream
6
 
7
 
 
58
  return collection[self.item]
59
 
60
 
61
+ class DuplicateByList(StreamOperator):
62
  field: str
63
  to_field: Optional[str] = None
64
  use_deep_copy: bool = False
 
80
  yield instance_copy
81
 
82
 
83
+ class DuplicateBySubLists(StreamOperator):
84
  field: str
85
  to_field: Optional[str] = None
86
  use_deep_copy: bool = False
dict_utils.py CHANGED
@@ -1,6 +1,8 @@
1
  import re
2
  from typing import Any, List, Tuple
3
 
 
 
4
  indx = re.compile(r"^(\d+)$")
5
  name = re.compile(r"^[\w. -]+$")
6
 
@@ -395,22 +397,20 @@ def dict_get(
395
  if len(components) > 1:
396
  try:
397
  success, values = get_values(dic, components, -1 * len(components))
398
- if not success:
399
- if not_exist_ok:
400
- return default
401
- raise ValueError(
402
- f'query "{query}" did not match any item in dict: {dic}'
403
- )
404
-
405
- return values
406
-
407
  except Exception as e:
408
- if not_exist_ok:
409
- return default
410
  raise ValueError(
411
- f'query "{query}" did not match any item in dict: {dic}'
412
  ) from e
413
 
 
 
 
414
  # len(components) == 1
415
  if components[0] in dic:
416
  return dic[components[0]]
@@ -418,7 +418,9 @@ def dict_get(
418
  if not_exist_ok:
419
  return default
420
 
421
- raise ValueError(f'query "{query}" did not match any item in dict: {dic}')
 
 
422
 
423
 
424
  # dict_set sets a value, 'value', which by itself, can be a dict or list or scalar, into 'dic', to become the value of
 
1
  import re
2
  from typing import Any, List, Tuple
3
 
4
+ from .text_utils import construct_dict_str
5
+
6
  indx = re.compile(r"^(\d+)$")
7
  name = re.compile(r"^[\w. -]+$")
8
 
 
397
  if len(components) > 1:
398
  try:
399
  success, values = get_values(dic, components, -1 * len(components))
400
+ if success:
401
+ return values
 
 
 
402
  except Exception as e:
 
 
403
  raise ValueError(
404
+ f'query "{query}" did not match any item in dict:\n{construct_dict_str(dic)}'
405
  ) from e
406
 
407
+ if not_exist_ok:
408
+ return default
409
+
410
+ raise ValueError(
411
+ f'query "{query}" did not match any item in dict:\n{construct_dict_str(dic)}'
412
+ )
413
+
414
  # len(components) == 1
415
  if components[0] in dic:
416
  return dic[components[0]]
 
418
  if not_exist_ok:
419
  return default
420
 
421
+ raise ValueError(
422
+ f'query "{query}" did not match any item in dict:\n{construct_dict_str(dic)}'
423
+ )
424
 
425
 
426
  # dict_set sets a value, 'value', which by itself, can be a dict or list or scalar, into 'dic', to become the value of
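
The dict_get change above returns early on a successful nested lookup and routes every miss through the `not_exist_ok`/`default` handling, with failures reported via `construct_dict_str`. A standalone sketch of the resulting control flow (illustrative names, not the dict_utils signature):

.. code-block:: python

    from typing import Any

    # Illustrative only: resolve a "/"-separated query, returning early on success and
    # honoring not_exist_ok/default on any miss, as dict_get now does.
    def nested_get(dic: dict, query: str, not_exist_ok: bool = False, default: Any = None) -> Any:
        current: Any = dic
        for component in query.split("/"):
            try:
                current = current[component]
            except (KeyError, IndexError, TypeError):
                if not_exist_ok:
                    return default
                raise ValueError(f'query "{query}" did not match any item in dict:\n{dic}')
        return current

    data = {"a": {"b": 1}}
    print(nested_get(data, "a/b"))                                # 1
    print(nested_get(data, "a/c", not_exist_ok=True, default=0))  # 0
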
formats.py CHANGED
@@ -7,11 +7,11 @@ from typing import (
7
  )
8
 
9
  from .dataclass import OptionalField
10
- from .operator import StreamInstanceOperator
11
  from .type_utils import isoftype
12
 
13
 
14
- class Format(StreamInstanceOperator):
15
  pass
16
 
17
 
 
7
  )
8
 
9
  from .dataclass import OptionalField
10
+ from .operator import InstanceOperator
11
  from .type_utils import isoftype
12
 
13
 
14
+ class Format(InstanceOperator):
15
  pass
16
 
17
 
inference.py CHANGED
@@ -5,24 +5,20 @@ from typing import Any, Dict, List, Literal, Optional, Union
5
 
6
  from .artifact import Artifact
7
  from .operator import PackageRequirementsMixin
8
- from .settings_utils import get_settings
9
 
10
 
11
  class InferenceEngine(abc.ABC, Artifact):
12
  """Abstract base class for inference."""
13
 
14
  @abc.abstractmethod
15
- def infer(self, dataset):
16
  """Perform inference on the input dataset."""
17
  pass
18
 
19
- @staticmethod
20
- def _assert_allow_passing_data_to_remote_api(remote_api_label: str):
21
- assert get_settings().allow_passing_data_to_remote_api, (
22
- f"LlmAsJudge metric cannot run send data to remote APIs ({remote_api_label}) when"
23
- f" unitxt.settings.allow_passing_data_to_remote_api=False."
24
- f" Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this. "
25
- )
26
 
27
 
28
  class HFPipelineBasedInferenceEngine(InferenceEngine, PackageRequirementsMixin):
@@ -73,7 +69,7 @@ class HFPipelineBasedInferenceEngine(InferenceEngine, PackageRequirementsMixin):
73
  model=self.model_name, trust_remote_code=True, **model_args
74
  )
75
 
76
- def infer(self, dataset):
77
  outputs = []
78
  for output in self.model([instance["source"] for instance in dataset]):
79
  if isinstance(output, list):
@@ -88,7 +84,7 @@ class MockInferenceEngine(InferenceEngine):
88
  def prepare(self):
89
  return
90
 
91
- def infer(self, dataset):
92
  return ["[[10]]" for instance in dataset]
93
 
94
 
@@ -114,6 +110,7 @@ class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
114
  _requirement = {
115
  "genai": "Install ibm-genai package using 'pip install --upgrade ibm-generative-ai"
116
  }
 
117
 
118
  def prepare(self):
119
  from genai import Client, Credentials
@@ -128,9 +125,7 @@ class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
128
  credentials = Credentials(api_key=api_key, api_endpoint=api_endpoint)
129
  self.client = Client(credentials=credentials)
130
 
131
- self._assert_allow_passing_data_to_remote_api(self.label)
132
-
133
- def infer(self, dataset):
134
  from genai.schema import TextGenerationParameters
135
 
136
  genai_params = TextGenerationParameters(
@@ -186,9 +181,8 @@ class OpenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
186
  )
187
 
188
  self.client = OpenAI(api_key=api_key)
189
- self._assert_allow_passing_data_to_remote_api(self.label)
190
 
191
- def infer(self, dataset):
192
  return [
193
  self.client.chat.completions.create(
194
  messages=[
 
5
 
6
  from .artifact import Artifact
7
  from .operator import PackageRequirementsMixin
 
8
 
9
 
10
  class InferenceEngine(abc.ABC, Artifact):
11
  """Abstract base class for inference."""
12
 
13
  @abc.abstractmethod
14
+ def _infer(self, dataset):
15
  """Perform inference on the input dataset."""
16
  pass
17
 
18
+ def infer(self, dataset):
19
+ """Verifies instances of a dataset and performs inference."""
20
+ [self.verify_instance(instance) for instance in dataset]
21
+ return self._infer(dataset)
 
 
 
22
 
23
 
24
  class HFPipelineBasedInferenceEngine(InferenceEngine, PackageRequirementsMixin):
 
69
  model=self.model_name, trust_remote_code=True, **model_args
70
  )
71
 
72
+ def _infer(self, dataset):
73
  outputs = []
74
  for output in self.model([instance["source"] for instance in dataset]):
75
  if isinstance(output, list):
 
84
  def prepare(self):
85
  return
86
 
87
+ def _infer(self, dataset):
88
  return ["[[10]]" for instance in dataset]
89
 
90
 
 
110
  _requirement = {
111
  "genai": "Install ibm-genai package using 'pip install --upgrade ibm-generative-ai"
112
  }
113
+ data_classification_policy = ["public", "proprietary"]
114
 
115
  def prepare(self):
116
  from genai import Client, Credentials
 
125
  credentials = Credentials(api_key=api_key, api_endpoint=api_endpoint)
126
  self.client = Client(credentials=credentials)
127
 
128
+ def _infer(self, dataset):
 
 
129
  from genai.schema import TextGenerationParameters
130
 
131
  genai_params = TextGenerationParameters(
 
181
  )
182
 
183
  self.client = OpenAI(api_key=api_key)
 
184
 
185
+ def _infer(self, dataset):
186
  return [
187
  self.client.chat.completions.create(
188
  messages=[
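
With the inference.py refactor, engines override the private `_infer` and inherit a public `infer` that first runs `verify_instance` on every instance, so the data classification check happens before anything is sent to a backend. A minimal sketch of a custom engine under that contract (the `EchoInferenceEngine` class is hypothetical and assumes unitxt is installed):

.. code-block:: python

    from unitxt.inference import InferenceEngine

    # Hypothetical engine: echoes each instance's "source" text. Only _infer() is
    # implemented; the inherited infer() verifies every instance against the
    # data_classification_policy before delegating here.
    class EchoInferenceEngine(InferenceEngine):
        data_classification_policy = ["public"]

        def prepare(self):
            pass

        def _infer(self, dataset):
            return [instance["source"] for instance in dataset]

    engine = EchoInferenceEngine()
    predictions = engine.infer(
        [{"source": "hello", "data_classification_policy": ["public"]}]
    )
    print(predictions)  # ['hello']
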
loaders.py CHANGED
@@ -15,16 +15,26 @@ Unitxt catalog contains several loaders for the most popular datasource formats.
15
  All these loaders inherit from Loader, and hence, implementing a loader to expand over a new type of datasource, is
16
  straight forward.
17
 
18
- Operators in Unitxt catalog:
19
- LoadHF : loads from Huggingface dataset.
20
- LoadCSV: loads from csv (comma separated value) files
21
- LoadFromKaggle: loads datasets from the kaggle.com community site
22
- LoadFromIBMCloud: loads a dataset from the IBM cloud.
 
 
 
23
  ------------------------
24
  """
25
  import itertools
26
  import os
27
  import tempfile
 
 
28
  from pathlib import Path
29
  from tempfile import TemporaryDirectory
30
  from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
@@ -37,6 +47,7 @@ from .dataclass import InternalField, OptionalField
37
  from .fusion import FixedFusion
38
  from .logging_utils import get_logger
39
  from .operator import SourceOperator
 
40
  from .settings_utils import get_settings
41
  from .stream import GeneratorStream, MultiStream
42
 
@@ -45,12 +56,22 @@ settings = get_settings()
45
 
46
 
47
  class Loader(SourceOperator):
48
- # The loader_limit an optional parameter used to control the maximum number of instances to load from the the source.
49
- # It is usually provided to the loader via the recipe (see standard.py)
50
- # The loader can use this value to limit the amount of data downloaded from the source
51
- # to reduce loading time. However, this may not always be possible, so the
52
- # loader may ignore this. In any case, the recipe, will limit the number of instances in the returned
53
- # stream, after load is complete.
 
 
 
 
54
  loader_limit: int = None
55
  streaming: bool = False
56
 
@@ -75,8 +96,66 @@ class Loader(SourceOperator):
75
  f"\nLoading limited to {self.get_limit()} instances by setting {self.get_limiter()};"
76
  )
77
 
 
 
 
 
 
78
 
79
  class LoadHF(Loader):
 
 
 
 
80
  path: str
81
  name: Optional[str] = None
82
  data_dir: Optional[str] = None
@@ -187,7 +266,15 @@ class LoadHF(Loader):
187
  }
188
  )
189
 
190
- def process(self):
 
 
 
 
191
  try:
192
  dataset = self.stream_dataset()
193
  except (
@@ -202,6 +289,25 @@ class LoadHF(Loader):
202
 
203
 
204
  class LoadCSV(Loader):
 
 
 
 
205
  files: Dict[str, str]
206
  chunksize: int = 1000
207
  _cache = InternalField(default_factory=dict)
@@ -236,7 +342,10 @@ class LoadCSV(Loader):
236
 
237
  yield from self._cache[file]
238
 
239
- def process(self):
 
 
 
240
  if self.streaming:
241
  return MultiStream(
242
  {
@@ -258,8 +367,25 @@ class LoadCSV(Loader):
258
 
259
 
260
  class LoadFromSklearn(Loader):
 
 
 
 
261
  dataset_name: str
262
  splits: List[str] = ["train", "test"]
 
263
 
264
  _requirements_list: List[str] = ["sklearn", "pandas"]
265
 
@@ -275,7 +401,7 @@ class LoadFromSklearn(Loader):
275
 
276
  self.downloader = getattr(sklearn_datatasets, f"fetch_{self.dataset_name}")
277
 
278
- def process(self):
279
  with TemporaryDirectory() as temp_directory:
280
  for split in self.splits:
281
  split_data = self.downloader(subset=split)
@@ -293,8 +419,25 @@ class MissingKaggleCredentialsError(ValueError):
293
 
294
 
295
  class LoadFromKaggle(Loader):
 
 
 
 
296
  url: str
 
297
  _requirements_list: List[str] = ["opendatasets"]
 
298
 
299
  def verify(self):
300
  super().verify()
@@ -312,7 +455,7 @@ class LoadFromKaggle(Loader):
312
 
313
  self.downloader = download
314
 
315
- def process(self):
316
  with TemporaryDirectory() as temp_directory:
317
  self.downloader(self.url, temp_directory)
318
  dataset = hf_load_dataset(temp_directory, streaming=False)
@@ -321,18 +464,47 @@ class LoadFromKaggle(Loader):
321
 
322
 
323
  class LoadFromIBMCloud(Loader):
 
 
 
 
324
  endpoint_url_env: str
325
  aws_access_key_id_env: str
326
  aws_secret_access_key_env: str
327
  bucket_name: str
328
  data_dir: str = None
329
 
330
- # Can be either:
331
- # 1. a list of file names, the split of each file is determined by the file name pattern
332
- # 2. Mapping: split -> file_name, e.g. {"test" : "test.json", "train": "train.json"}
333
- # 3. Mapping: split -> file_names, e.g. {"test" : ["test1.json", "test2.json"], "train": ["train.json"]}
334
  data_files: Union[Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
335
  caching: bool = True
 
 
336
  _requirements_list: List[str] = ["ibm_boto3"]
337
 
338
  def _download_from_cos(self, cos, bucket_name, item_name, local_file):
@@ -400,7 +572,10 @@ class LoadFromIBMCloud(Loader):
400
  if self.streaming:
401
  raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
402
 
403
- def process(self):
 
 
 
404
  import ibm_boto3
405
 
406
  cos = ibm_boto3.resource(
@@ -458,23 +633,37 @@ class LoadFromIBMCloud(Loader):
458
 
459
 
460
  class MultipleSourceLoader(Loader):
461
- """Allow loading data from multiple sources.
 
 
 
462
 
463
  Examples:
464
- 1) Loading the train split from Huggingface hub and the test set from a local file:
 
 
 
 
465
 
466
- MultipleSourceLoader(loaders = [ LoadHF(path="public/data",split="train"), LoadCSV({"test": "mytest.csv"}) ])
467
 
468
- 2) Loading a test set combined from two files
469
 
470
- MultipleSourceLoader(loaders = [ LoadCSV({"test": "mytest1.csv"}, LoadCSV({"test": "mytest2.csv"}) ])
471
 
 
472
 
 
473
  """
474
 
475
  sources: List[Loader]
476
 
477
- def process(self):
 
 
 
 
478
  return FixedFusion(
479
  origins=self.sources, max_instances_per_origin_split=self.get_limit()
480
  ).process()
@@ -485,19 +674,138 @@ class LoadFromDictionary(Loader):
485
 
486
  The loader can be used, for example, when debugging or working with small datasets.
487
 
488
- Attributes:
489
  data (Dict[str, List[Dict[str, Any]]]): a dictionary of constants from which the data will be loaded
490
 
491
- Examples:
492
- data = {
493
- "train": {"input": "SomeInput1", "output": "SomeResult1"},
494
- "test": {"input": "SomeInput2", "output": "SomeResult2"},
495
- }
496
- loader = LoadFromDictionary(data=data)
497
- multi_stream = loader.process()
 
 
 
498
  """
499
 
500
  data: Dict[str, List[Dict[str, Any]]]
501
 
502
- def process(self) -> MultiStream:
503
- return MultiStream.from_iterables(self.data)
 
 
 
 
 
 
15
  All these loaders inherit from Loader, and hence, implementing a loader to expand over a new type of datasource, is
16
  straight forward.
17
 
18
+ Available Loaders Overview:
19
+ - :ref:`LoadHF <unitxt.loaders.LoadHF>` - Loads data from Huggingface datasets.
20
+ - :ref:`LoadCSV <unitxt.loaders.LoadCSV>` - Imports data from CSV (Comma-Separated Values) files.
21
+ - :ref:`LoadFromKaggle <unitxt.loaders.LoadFromKaggle>` - Retrieves datasets from the Kaggle community site.
22
+ - :ref:`LoadFromIBMCloud <unitxt.loaders.LoadFromIBMCloud>` - Fetches datasets hosted on IBM Cloud.
23
+ - :ref:`LoadFromSklearn <unitxt.loaders.LoadFromSklearn>` - Loads datasets available through the sklearn library.
24
+ - :ref:`MultipleSourceLoader <unitxt.loaders.MultipleSourceLoader>` - Combines data from multiple different sources.
25
+ - :ref:`LoadFromDictionary <unitxt.loaders.LoadFromDictionary>` - Loads data from a user-defined Python dictionary.
26
+ - :ref:`LoadFromHFSpace <unitxt.loaders.LoadFromHFSpace>` - Downloads and loads data from Huggingface Spaces.
27
+
28
+
29
+
30
+
31
  ------------------------
32
  """
33
  import itertools
34
  import os
35
  import tempfile
36
+ from abc import abstractmethod
37
+ from copy import deepcopy
38
  from pathlib import Path
39
  from tempfile import TemporaryDirectory
40
  from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
47
  from .fusion import FixedFusion
48
  from .logging_utils import get_logger
49
  from .operator import SourceOperator
50
+ from .operators import AddFields
51
  from .settings_utils import get_settings
52
  from .stream import GeneratorStream, MultiStream
53
 
 
56
 
57
 
58
  class Loader(SourceOperator):
59
+ """A base class for all loaders.
60
+
61
+ A loader is the first component in the Unitxt Recipe,
62
+ responsible for loading data from various sources and preparing it as a MultiStream for processing.
63
+ The loader_limit is an optional parameter used to control the maximum number of instances to load from the data source. It is applied for each split separately.
64
+ It is usually provided to the loader via the recipe (see standard.py)
65
+ The loader can use this value to limit the amount of data downloaded from the source
66
+ to reduce loading time. However, this may not always be possible, so the
67
+ loader may ignore this. In any case, the recipe will limit the number of instances in the returned
68
+ stream, after load is complete.
69
+
70
+ Args:
71
+ loader_limit: Optional integer to specify a limit on the number of records to load.
72
+ streaming: Bool indicating if streaming should be used.
73
+ """
74
+
75
  loader_limit: int = None
76
  streaming: bool = False
77
 
 
96
  f"\nLoading limited to {self.get_limit()} instances by setting {self.get_limiter()};"
97
  )
98
 
99
+ def add_data_classification(self, multi_stream: MultiStream) -> MultiStream:
100
+ if self.data_classification_policy is None:
101
+ get_logger().warning(
102
+ f"The {self.get_pretty_print_name()} loader does not set the `data_classification_policy`. "
103
+ f"This may lead to sending of undesired data to external services.\n"
104
+ f"Set it to a list of classification identifiers. \n"
105
+ f"For example:\n"
106
+ f"data_classification_policy = ['public']\n"
107
+ f" or \n"
108
+ f"data_classification_policy = ['confidential', 'pii']\n"
109
+ )
110
+
111
+ operator = AddFields(
112
+ fields={"data_classification_policy": self.data_classification_policy}
113
+ )
114
+ return operator(multi_stream)
115
+
116
+ def sef_default_data_classification(
117
+ self, default_data_classification_policy, additional_info
118
+ ):
119
+ if self.data_classification_policy is None:
120
+ logger.info(
121
+ f"{self.get_pretty_print_name()} sets 'data_classification_policy' to "
122
+ f"{default_data_classification_policy} by default {additional_info}.\n"
123
+ "To use a different value or remove this message, explicitly set the "
124
+ "`data_classification_policy` attribute of the loader.\n"
125
+ )
126
+ self.data_classification_policy = default_data_classification_policy
127
+
128
+ @abstractmethod
129
+ def load_data(self):
130
+ pass
131
+
132
+ def process(self) -> MultiStream:
133
+ return self.add_data_classification(self.load_data())
134
+
135
 
136
  class LoadHF(Loader):
137
+ """Loads datasets from the Huggingface Hub.
138
+
139
+ It supports loading with or without streaming,
140
+ and can filter datasets upon loading.
141
+
142
+ Args:
143
+ path: The path or identifier of the dataset on the Huggingface Hub.
144
+ name: An optional dataset name.
145
+ data_dir: Optional directory to store downloaded data.
146
+ split: Optional specification of which split to load.
147
+ data_files: Optional specification of particular data files to load.
148
+ streaming: Bool indicating if streaming should be used.
149
+ filtering_lambda: A lambda function for filtering the data after loading.
150
+
151
+ Example:
152
+ Loading glue's mrpc dataset
153
+
154
+ .. code-block:: python
155
+
156
+ load_hf = LoadHF(path='glue', name='mrpc')
157
+ """
158
+
159
  path: str
160
  name: Optional[str] = None
161
  data_dir: Optional[str] = None
 
266
  }
267
  )
268
 
269
+ def load_data(self):
270
+ if os.path.exists(self.path):
271
+ self.sef_default_data_classification(
272
+ ["proprietary"], "when loading from local files"
273
+ )
274
+ else:
275
+ self.sef_default_data_classification(
276
+ ["public"], "when loading from Huggingface hub"
277
+ )
278
  try:
279
  dataset = self.stream_dataset()
280
  except (
 
289
 
290
 
291
  class LoadCSV(Loader):
292
+ """Loads data from CSV files.
293
+
294
+ Supports streaming and can handle large files by loading them in chunks.
295
+
296
+ Args:
297
+ files (Dict[str, str]): A dictionary mapping names to file paths.
298
+ chunksize : Size of the chunks to load at a time.
299
+ loader_limit: Optional integer to specify a limit on the number of records to load.
300
+ streaming: Bool indicating if streaming should be used.
301
+ sep: String specifying the separator used in the CSV files.
302
+
303
+ Example:
304
+ Loading csv
305
+
306
+ .. code-block:: python
307
+
308
+ load_csv = LoadCSV(files={'train': 'path/to/train.csv'}, chunksize=100)
309
+ """
310
+
311
  files: Dict[str, str]
312
  chunksize: int = 1000
313
  _cache = InternalField(default_factory=dict)
 
342
 
343
  yield from self._cache[file]
344
 
345
+ def load_data(self):
346
+ self.sef_default_data_classification(
347
+ ["proprietary"], "when loading from local files"
348
+ )
349
  if self.streaming:
350
  return MultiStream(
351
  {
 
367
 
368
 
369
  class LoadFromSklearn(Loader):
370
+ """Loads datasets from the sklearn library.
371
+
372
+ This loader does not support streaming and is intended for use with sklearn's dataset fetch functions.
373
+
374
+ Args:
375
+ dataset_name: The name of the sklearn dataset to fetch.
376
+ splits: A list of data splits to load, e.g., ['train', 'test'].
377
+
378
+ Example:
379
+ Loading form sklearn
380
+
381
+ .. code-block:: python
382
+
383
+ load_sklearn = LoadFromSklearn(dataset_name='iris', splits=['train', 'test'])
384
+ """
385
+
386
  dataset_name: str
387
  splits: List[str] = ["train", "test"]
388
+ data_classification_policy = ["public"]
389
 
390
  _requirements_list: List[str] = ["sklearn", "pandas"]
391
 
 
401
 
402
  self.downloader = getattr(sklearn_datatasets, f"fetch_{self.dataset_name}")
403
 
404
+ def load_data(self):
405
  with TemporaryDirectory() as temp_directory:
406
  for split in self.splits:
407
  split_data = self.downloader(subset=split)
 
419
 
420
 
421
  class LoadFromKaggle(Loader):
422
+ """Loads datasets from Kaggle.
423
+
424
+ Requires Kaggle API credentials and does not support streaming.
425
+
426
+ Args:
427
+ url: URL to the Kaggle dataset.
428
+
429
+ Example:
430
+ Loading from kaggle
431
+
432
+ .. code-block:: python
433
+
434
+ load_kaggle = LoadFromKaggle(url='kaggle.com/dataset/example')
435
+ """
436
+
437
  url: str
438
+
439
  _requirements_list: List[str] = ["opendatasets"]
440
+ data_classification_policy = ["public"]
441
 
442
  def verify(self):
443
  super().verify()
 
455
 
456
  self.downloader = download
457
 
458
+ def load_data(self):
459
  with TemporaryDirectory() as temp_directory:
460
  self.downloader(self.url, temp_directory)
461
  dataset = hf_load_dataset(temp_directory, streaming=False)
 
464
 
465
 
466
  class LoadFromIBMCloud(Loader):
467
+ """Loads data from IBM Cloud Object Storage.
468
+
469
+ Does not support streaming and requires AWS-style access keys.
470
+ data_files can be either:
471
+ 1. a list of file names, the split of each file is determined by the file name pattern
472
+ 2. Mapping: split -> file_name, e.g. {"test" : "test.json", "train": "train.json"}
473
+ 3. Mapping: split -> file_names, e.g. {"test" : ["test1.json", "test2.json"], "train": ["train.json"]}
474
+
475
+ Args:
476
+ endpoint_url_env: Environment variable name for the IBM Cloud endpoint URL.
477
+ aws_access_key_id_env: Environment variable name for the AWS access key ID.
478
+ aws_secret_access_key_env: Environment variable name for the AWS secret access key.
479
+ bucket_name: Name of the S3 bucket from which to load data.
480
+ data_dir: Optional directory path within the bucket.
481
+ data_files: Union type allowing either a list of file names or a mapping of splits to file names.
482
+ caching: Bool indicating if caching is enabled to avoid re-downloading data.
483
+
484
+ Example:
485
+ Loading from IBM Cloud
486
+
487
+ .. code-block:: python
488
+
489
+ load_ibm_cloud = LoadFromIBMCloud(
490
+ endpoint_url_env='IBM_CLOUD_ENDPOINT',
491
+ aws_access_key_id_env='IBM_AWS_ACCESS_KEY_ID',
492
+ aws_secret_access_key_env='IBM_AWS_SECRET_ACCESS_KEY',
493
+ bucket_name='my-bucket'
494
+ )
495
+ multi_stream = load_ibm_cloud.process()
496
+ """
497
+
498
  endpoint_url_env: str
499
  aws_access_key_id_env: str
500
  aws_secret_access_key_env: str
501
  bucket_name: str
502
  data_dir: str = None
503
 
 
 
 
 
504
  data_files: Union[Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
505
  caching: bool = True
506
+ data_classification_policy = ["proprietary"]
507
+
508
  _requirements_list: List[str] = ["ibm_boto3"]
509
 
510
  def _download_from_cos(self, cos, bucket_name, item_name, local_file):
 
572
  if self.streaming:
573
  raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
574
 
575
+ def load_data(self):
576
+ self.sef_default_data_classification(
577
+ ["proprietary"], "when loading from IBM COS"
578
+ )
579
  import ibm_boto3
580
 
581
  cos = ibm_boto3.resource(
 
633
 
634
 
635
  class MultipleSourceLoader(Loader):
636
+ """Allows loading data from multiple sources, potentially mixing different types of loaders.
637
+
638
+ Args:
639
+ sources: A list of loaders that will be combined to form a unified dataset.
640
 
641
  Examples:
642
+ 1) Loading the train split from Huggingface hub and the test set from a local file:
643
+
644
+ .. code-block:: python
645
+
646
+ MultipleSourceLoader(loaders = [ LoadHF(path="public/data",split="train"), LoadCSV({"test": "mytest.csv"}) ])
647
 
 
648
 
 
649
 
650
+ 2) Loading a test set combined from two files
651
 
652
+ .. code-block:: python
653
 
654
+ MultipleSourceLoader(loaders = [ LoadCSV({"test": "mytest1.csv"}), LoadCSV({"test": "mytest2.csv"}) ])
655
  """
656
 
657
  sources: List[Loader]
658
 
659
+ # MultipleSourceLoaders uses the the data classification from source loaders,
660
+ # so only need to add it, if explicitly requested to override.
661
+ def add_data_classification(self, multi_stream: MultiStream) -> MultiStream:
662
+ if self.data_classification_policy is None:
663
+ return multi_stream
664
+ return super().add_data_classification(multi_stream)
665
+
666
+ def load_data(self):
667
  return FixedFusion(
668
  origins=self.sources, max_instances_per_origin_split=self.get_limit()
669
  ).process()
 
674
 
675
  The loader can be used, for example, when debugging or working with small datasets.
676
 
677
+ Args:
678
  data (Dict[str, List[Dict[str, Any]]]): a dictionary of constants from which the data will be loaded
679
 
680
+ Example:
681
+ Loading dictionary
682
+
683
+ .. code-block:: python
684
+
685
+ data = {
686
+ "train": {"input": "SomeInput1", "output": "SomeResult1"},
687
+ "test": {"input": "SomeInput2", "output": "SomeResult2"},
688
+ }
689
+ loader = LoadFromDictionary(data=data)
690
  """
691
 
692
  data: Dict[str, List[Dict[str, Any]]]
693
 
694
+ def load_data(self) -> MultiStream:
695
+ self.sef_default_data_classification(
696
+ ["proprietary"], "when loading from python dictionary"
697
+ )
698
+ return MultiStream.from_iterables(deepcopy(self.data))
699
+
700
+
701
+ class LoadFromHFSpace(LoadHF):
702
+ """Used to load data from Huggingface spaces.
703
+
704
+ The loader first tries to download all files specified in the 'data_files' parameter
705
+ from the given space and then reads them as a Huggingface dataset.
706
+
707
+ Args:
708
+ space_name (str): Name of the Huggingface space to be accessed.
709
+ data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]]): Relative
710
+ paths to files within a given repository. If given as a mapping, paths should
711
+ be values, while keys should represent the type of respective files
712
+ (training, testing etc.).
713
+ path (str, optional): Absolute path to a directory where data should be downloaded to.
714
+ revision (str, optional): ID of a Git branch or commit to be used. By default, it is
715
+ set to None, thus data is downloaded from the main branch of the accessed
716
+ repository.
717
+ use_token (bool, optional): Whether token used for authentication when accessing
718
+ the Huggingface space - if necessary - should be read from the Huggingface
719
+ config folder.
720
+ token_env (str, optional): Key of an env variable whose value will be used for
721
+ authentication when accessing the Huggingface space - if necessary.
722
+
723
+ Example:
724
+ Loading from Huggingface Space
725
+
726
+ .. code-block:: python
727
+
728
+ loader = LoadFromHFSpace(
729
+ space_name="lmsys/mt-bench",
730
+ data_files={
731
+ "train": [
732
+ "data/mt_bench/model_answer/gpt-3.5-turbo.jsonl",
733
+ "data/mt_bench/model_answer/gpt-4.jsonl",
734
+ ],
735
+ "test": "data/mt_bench/model_answer/tulu-30b.jsonl",
736
+ },
737
+ )
738
+ """
739
+
740
+ space_name: str
741
+ data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
742
+ path: Optional[str] = None
743
+ revision: Optional[str] = None
744
+ use_token: Optional[bool] = None
745
+ token_env: Optional[str] = None
746
+ requirements_list: List[str] = ["huggingface_hub"]
747
+
748
+ def _get_token(self) -> Optional[Union[bool, str]]:
749
+ if self.token_env:
750
+ token = os.getenv(self.token_env)
751
+ if not token:
752
+ get_logger().warning(
753
+ f"The 'token_env' parameter was specified as '{self.token_env}', "
754
+ f"however, no environment variable under such a name was found. "
755
+ f"Therefore, the loader will not use any tokens for authentication."
756
+ )
757
+ return token
758
+ return self.use_token
759
+
760
+ def _download_file_from_space(self, filename: str) -> str:
761
+ from huggingface_hub import hf_hub_download
762
+ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
763
+
764
+ token = self._get_token()
765
+
766
+ try:
767
+ file_path = hf_hub_download(
768
+ repo_id=self.space_name,
769
+ filename=filename,
770
+ repo_type="space",
771
+ token=token,
772
+ revision=self.revision,
773
+ local_dir=self.path,
774
+ )
775
+ except EntryNotFoundError as e:
776
+ raise ValueError(
777
+ f"The file '{filename}' was not found in the space '{self.space_name}'. "
778
+ f"Please check if the filename is correct, or if it exists in that "
779
+ f"Huggingface space."
780
+ ) from e
781
+ except RepositoryNotFoundError as e:
782
+ raise ValueError(
783
+ f"The Huggingface space '{self.space_name}' was not found. "
784
+ f"Please check if the name is correct and you have access to the space."
785
+ ) from e
786
+
787
+ return file_path
788
+
789
+ def _download_data(self) -> str:
790
+ if isinstance(self.data_files, str):
791
+ data_files = [self.data_files]
792
+ elif isinstance(self.data_files, Mapping):
793
+ data_files = list(self.data_files.values())
794
+ else:
795
+ data_files = self.data_files
796
+
797
+ for files in data_files:
798
+ if isinstance(files, str):
799
+ files = [files]
800
+ # All files - within the same space - are downloaded into the same base directory:
801
+ paths = [self._download_file_from_space(file) for file in files]
802
+ dir_path = paths[0].replace(files[0], "")
803
+
804
+ return dir_path
805
+
806
+ def load_data(self):
807
+ self.sef_default_data_classification(
808
+ ["public"], "when loading from Huggingface spaces"
809
+ )
810
+ self.path = self._download_data()
811
+ return super().load_data()
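
The loader changes make every loader implement `load_data`, while the shared `process` stamps each instance with a `data_classification_policy` field: the loader's explicit value if set, otherwise a per-source default ("public" for hub, sklearn and Kaggle data, "proprietary" for local files, IBM COS and in-memory dictionaries). A short sketch of how that surfaces downstream, using `LoadFromDictionary` from this commit (the data is illustrative and assumes unitxt is installed):

.. code-block:: python

    from unitxt.loaders import LoadFromDictionary

    # Illustrative in-memory data; with no explicit policy, LoadFromDictionary defaults
    # to ["proprietary"] before add_data_classification() stamps the field on instances.
    data = {
        "test": [
            {"input": "SomeInput1", "output": "SomeResult1"},
            {"input": "SomeInput2", "output": "SomeResult2"},
        ],
    }
    loader = LoadFromDictionary(data=data)
    multi_stream = loader.process()
    for instance in multi_stream["test"]:
        print(instance["data_classification_policy"])  # ['proprietary'] on every instance
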
metrics.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  import string
3
  import uuid
@@ -6,6 +7,7 @@ from abc import ABC, abstractmethod
6
  from collections import Counter, defaultdict
7
  from copy import deepcopy
8
  from dataclasses import field
 
9
  from statistics import mean
10
  from typing import Any, Dict, Generator, List, Optional, Tuple
11
 
@@ -20,10 +22,10 @@ from .dataclass import AbstractField, InternalField, NonPositionalField, Optiona
20
  from .logging_utils import get_logger
21
  from .metric_utils import InstanceInput, MetricRequest, MetricResponse
22
  from .operator import (
 
23
  MultiStreamOperator,
24
- SingleStreamOperator,
25
  StreamingOperator,
26
- StreamInstanceOperator,
27
  )
28
  from .operators import CopyFields
29
  from .random_utils import get_seed
@@ -68,7 +70,7 @@ def nan_max(x):
68
  return np.nanmax(x)
69
 
70
 
71
- class UpdateStream(StreamInstanceOperator):
72
  update: dict
73
 
74
  def process(
@@ -94,6 +96,28 @@ class Metric(Artifact):
94
  # parsing on every use
95
  _parsed_prediction_type = None
96
 
 
 
 
 
 
 
 
97
  def _validate_references_and_prediction(self, references, predictions):
98
  if not isoftype(predictions, List[Any]):
99
  raise ValueError(
@@ -151,7 +175,7 @@ class Metric(Artifact):
151
  self._parsed_prediction_type = parse_type_string(self.prediction_type)
152
  except ValueError:
153
  raise ValueError(
154
- "Could convert prediction type '{self.prediction_type}' in {self.get_metric_name()} to known type. To enable type checking for this prediction type, open unitxt issue with this message. Alternatively, set the metric's prediction_type to 'Any'"
155
  ) from None
156
  return self._parsed_prediction_type
157
 
@@ -166,6 +190,7 @@ class Metric(Artifact):
166
  additional_inputs = []
167
  instances = []
168
  for instance in stream:
 
169
  references.append(instance["references"])
170
  predictions.append(instance["prediction"])
171
  additional_inputs.append(
@@ -421,7 +446,7 @@ class MetricWithConfidenceInterval(Metric):
421
  return result
422
 
423
 
424
- class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
425
  """A class for computing metrics that require joint calculations over all instances and are not just aggregation of scores of individuals instances.
426
 
427
  For example, macro_F1 requires
@@ -445,15 +470,16 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
445
  instances = []
446
 
447
  for instance in stream:
 
 
448
  if "score" not in instance:
449
- instance["score"] = {"global": global_score, "instance": {}}
450
- else:
451
- global_score = instance["score"]["global"]
452
 
453
  instance_references, instance_prediction = (
454
  instance["references"],
455
  instance["prediction"],
456
  )
 
457
  references.append(instance_references)
458
  predictions.append(instance_prediction)
459
  instances.append(instance)
@@ -463,6 +489,7 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
463
  )
464
  task_data.append(instance_task_data)
465
  instance_score = None
 
466
  # for backward compatibility
467
  no_score_value = np.nan
468
  if self.process_single_instances:
@@ -483,13 +510,14 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
483
  if isinstance(self.main_score, str):
484
  instance_score[self.main_score] = no_score_value
485
 
486
- instance["score"]["instance"].update(instance_score)
 
 
487
  self._validate_references_and_prediction(references, predictions)
488
 
489
  result = self._compute(references, predictions, task_data)
490
 
491
- global_score.update(result)
492
-
493
  score_name = global_score["score_name"]
494
  confidence_interval = self.compute_global_confidence_intervals(
495
  references, predictions, task_data, score_name
@@ -497,7 +525,7 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
497
  global_score.update(confidence_interval)
498
 
499
  for instance in instances:
500
- instance["score"]["global"] = global_score
501
  yield instance
502
 
503
  def _compute(
@@ -531,11 +559,12 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
531
  pass
532
 
533
 
534
- class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
535
  n_resamples: int = OptionalField(
536
  default_factory=lambda: settings.num_resamples_for_instance_metrics
537
  )
538
  main_score: str
 
539
  reduction_map: Dict[str, List[str]]
540
 
541
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
@@ -549,7 +578,9 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
549
  list,
550
  zip(
551
  *[
552
- (instance["references"], instance["prediction"])
 
 
553
  for instance in stream
554
  ]
555
  ),
@@ -574,12 +605,11 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
574
 
575
  for instance, score in zip(stream, instance_scores):
576
  if "score" not in instance:
577
- instance["score"] = {"global": global_score, "instance": {}}
578
- else:
579
- global_score = instance["score"]["global"]
580
-
581
- instance["score"]["instance"].update(score)
582
 
 
 
 
583
  instances.append(instance)
584
 
585
  for reduction, fields in self.reduction_map.items():
@@ -589,27 +619,32 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
589
 
590
  if reduction == "mean":
591
  for field_name in fields:
592
- global_score[field_name] = mean(
 
593
  [
594
- instance["score"]["instance"][field_name]
595
  for instance in instances
596
  ]
597
  )
598
  if field_name == self.main_score:
599
- global_score["score"] = global_score[field_name]
600
- global_score["score_name"] = self.main_score
601
 
602
  ci_fields = (
603
  list(set(self.ci_scores))
604
  if self.ci_scores is not None
605
  else [self.main_score]
606
  )
 
 
 
607
  confidence_interval = self.score_based_confidence_interval(
608
- instances=instances, score_names=ci_fields
609
  )
610
  global_score.update(confidence_interval)
611
 
612
  for instance in instances:
 
613
  yield instance
614
 
615
  @abstractmethod
@@ -622,7 +657,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
622
  pass
623
 
624
 
625
- class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
626
  """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs).
627
 
628
  InstanceMetric currently allows two reductions:
@@ -748,8 +783,8 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
748
  ), f"each instance task_data dict must have a key {self.subgroup_column}"
749
 
750
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
751
- instances, global_score = self.compute_instance_scores(stream)
752
-
753
  for reduction_type, reduction_params in self.reduction_map.items():
754
  assert (
755
  reduction_type in self.implemented_reductions
@@ -795,7 +830,9 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
795
 
796
  # calculate global scores for each reduction field
797
  for field_name in reduction_fields:
798
- field_name_full = field_name_full_prefix + field_name
 
 
799
  # if group resampling (3rd element of agg_func parameter) is True, then
800
  # 1. scores_to_resample are the group scores, and
801
  # 2. aggregation_function is to take the raw mean
@@ -804,7 +841,7 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
804
  # 2. aggregation_function is to apply the group aggregation from the instance scores
805
  # either way, the application of aggregation_function to scores_to_resample yields the global score
806
  global_score[field_name_full] = aggregation_function(
807
- scores_to_resample, field_name
808
  )
809
  if field_name == self.main_score:
810
  global_score["score"] = global_score[field_name_full]
@@ -815,21 +852,26 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
815
  if self.ci_scores is not None:
816
  confidence_interval = self.score_based_confidence_interval(
817
  instances=scores_to_resample,
818
- score_names=list(set(self.ci_scores)),
 
 
819
  ci_score_prefix=field_name_full_prefix,
820
  aggregation_func=aggregation_function,
821
  )
822
  global_score.update(confidence_interval)
823
 
 
 
824
  yield from instances
825
 
826
  def compute_instance_scores(
827
  self, stream: Stream, stream_name: Optional[str] = None
828
  ):
829
- global_score = {}
830
  instances = []
831
 
832
  for instance in stream:
 
 
833
  task_data = instance["task_data"] if "task_data" in instance else {}
834
 
835
  if self.reference_field == "references":
@@ -849,18 +891,19 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
849
  instance_score = self.compute(
850
  references=refs, prediction=pred, task_data=task_data
851
  )
 
852
  instance_score["score"] = instance_score[self.main_score]
853
  instance_score["score_name"] = self.main_score
854
  if "score" not in instance:
855
- instance["score"] = {"global": global_score, "instance": {}}
856
- else:
857
- global_score = instance["score"]["global"]
858
 
859
- instance["score"]["instance"].update(instance_score)
 
 
860
 
861
  instances.append(instance)
862
 
863
- return instances, global_score
864
 
865
  def get_group_scores(
866
  self, instances: List[dict], score_names: List[str], group_aggregation_func
@@ -1082,8 +1125,14 @@ class MetricPipeline(MultiStreamOperator, Metric):
1082
  super().prepare()
1083
  self.prepare_score = CopyFields(
1084
  field_to_field=[
1085
- [f"score/instance/{self.main_score}", "score/instance/score"],
1086
- [f"score/global/{self.main_score}", "score/global/score"],
 
 
 
 
 
 
1087
  ],
1088
  )
1089
 
@@ -2098,6 +2147,7 @@ class LlamaIndexCorrectness(InstanceMetric):
2098
  ] = [] # this is here for the sake of documentation for future models
2099
  mock_models: List[str] = ["mock"]
2100
  external_api_models = openai_models + anthropic_models
 
2101
 
2102
  _requirements_list: List[str] = ["llama_index"]
2103
 
@@ -2179,11 +2229,6 @@ class LlamaIndexCorrectness(InstanceMetric):
2179
  # treat the references as the questions and the predictions as answers
2180
  # assume a single reference
2181
 
2182
- assert (
2183
- not self._model_using_extrnal_api()
2184
- or settings.allow_passing_data_to_remote_api
2185
- ), f"Cannot run send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
2186
-
2187
  query = task_data["question"]
2188
 
2189
  contexts = None
@@ -2733,7 +2778,7 @@ class KPA(CustomF1):
2733
  return element == "none"
2734
 
2735
 
2736
- class RemoteMetric(SingleStreamOperator, Metric):
2737
  """A metric that runs another metric remotely.
2738
 
2739
  main_score: the score updated by this metric.
@@ -2746,10 +2791,12 @@ class RemoteMetric(SingleStreamOperator, Metric):
2746
  endpoint: str
2747
  metric_name: str
2748
  api_key: str = None
 
2749
 
2750
  @staticmethod
2751
  def wrap_inner_metric_pipeline_metric(
2752
- metric_pipeline: MetricPipeline, remote_metrics_endpoint: str
 
2753
  ) -> MetricPipeline:
2754
  """Wrap the inner metric in a MetricPipeline with a RemoteMetric.
2755
 
@@ -3662,3 +3709,40 @@ class NormalizedSacrebleu(HuggingfaceMetric):
3662
  "mecab_ko": KO_ERROR_MESSAGE,
3663
  "mecab_ko_dic": KO_ERROR_MESSAGE,
3664
  }
 
 
 
 
 
 
 
1
+ import ast
2
  import re
3
  import string
4
  import uuid
 
7
  from collections import Counter, defaultdict
8
  from copy import deepcopy
9
  from dataclasses import field
10
+ from operator import itemgetter
11
  from statistics import mean
12
  from typing import Any, Dict, Generator, List, Optional, Tuple
13
 
 
22
  from .logging_utils import get_logger
23
  from .metric_utils import InstanceInput, MetricRequest, MetricResponse
24
  from .operator import (
25
+ InstanceOperator,
26
  MultiStreamOperator,
 
27
  StreamingOperator,
28
+ StreamOperator,
29
  )
30
  from .operators import CopyFields
31
  from .random_utils import get_seed
 
70
  return np.nanmax(x)
71
 
72
 
73
+ class UpdateStream(InstanceOperator):
74
  update: dict
75
 
76
  def process(
 
96
  # parsing on every use
97
  _parsed_prediction_type = None
98
 
99
+ #
100
+ # Used to add a prefix to all scores, except the "score_name" and "score" fields.
101
+ # This is used to distinguish two scores of the same metric, operating on different fields of the task.
102
+ #
103
+ score_prefix: str = ""
104
+
105
+ def _add_score_prefix(self, score_name):
106
+ return (
107
+ self.score_prefix + score_name
108
+ if score_name not in ["score", "score_name"]
109
+ else score_name
110
+ )
111
+
112
+ def _add_score_prefixes_to_score_dict(self, scores: Dict[str, Any]):
113
+ new_scores = {}
114
+ for score_name, score in scores.items():
115
+ score_with_prefix = self._add_score_prefix(score_name)
116
+ new_scores[score_with_prefix] = (
117
+ score if score_name not in ["score_name"] else self.score_prefix + score
118
+ )
119
+ return new_scores
120
+
121
  def _validate_references_and_prediction(self, references, predictions):
122
  if not isoftype(predictions, List[Any]):
123
  raise ValueError(
 
175
  self._parsed_prediction_type = parse_type_string(self.prediction_type)
176
  except ValueError:
177
  raise ValueError(
178
+ f"Could not convert prediction type '{self.prediction_type}' in {self.get_metric_name()} to a known type. To enable type checking for this prediction type, open a unitxt issue with this message. Alternatively, set the metric's prediction_type to 'Any'"
179
  ) from None
180
  return self._parsed_prediction_type
181
 
 
190
  additional_inputs = []
191
  instances = []
192
  for instance in stream:
193
+ instance = self.verify_instance(instance)
194
  references.append(instance["references"])
195
  predictions.append(instance["prediction"])
196
  additional_inputs.append(
 
446
  return result
447
 
448
 
449
+ class GlobalMetric(StreamOperator, MetricWithConfidenceInterval):
450
  """A class for computing metrics that require joint calculations over all instances and are not just aggregation of scores of individuals instances.
451
 
452
  For example, macro_F1 requires
 
470
  instances = []
471
 
472
  for instance in stream:
473
+ instance = self.verify_instance(instance)
474
+
475
  if "score" not in instance:
476
+ instance["score"] = {"global": {}, "instance": {}}
 
 
477
 
478
  instance_references, instance_prediction = (
479
  instance["references"],
480
  instance["prediction"],
481
  )
482
+
483
  references.append(instance_references)
484
  predictions.append(instance_prediction)
485
  instances.append(instance)
 
489
  )
490
  task_data.append(instance_task_data)
491
  instance_score = None
492
+
493
  # for backward compatibility
494
  no_score_value = np.nan
495
  if self.process_single_instances:
 
510
  if isinstance(self.main_score, str):
511
  instance_score[self.main_score] = no_score_value
512
 
513
+ instance["score"]["instance"].update(
514
+ self._add_score_prefixes_to_score_dict(instance_score)
515
+ )
516
  self._validate_references_and_prediction(references, predictions)
517
 
518
  result = self._compute(references, predictions, task_data)
519
 
520
+ global_score.update(self._add_score_prefixes_to_score_dict(result))
 
521
  score_name = global_score["score_name"]
522
  confidence_interval = self.compute_global_confidence_intervals(
523
  references, predictions, task_data, score_name
 
525
  global_score.update(confidence_interval)
526
 
527
  for instance in instances:
528
+ instance["score"]["global"].update(global_score)
529
  yield instance
530
 
531
  def _compute(
 
559
  pass
560
 
561
 
562
+ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
563
  n_resamples: int = OptionalField(
564
  default_factory=lambda: settings.num_resamples_for_instance_metrics
565
  )
566
  main_score: str
567
+
568
  reduction_map: Dict[str, List[str]]
569
 
570
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
 
578
  list,
579
  zip(
580
  *[
581
+ itemgetter("references", "prediction")(
582
+ self.verify_instance(instance)
583
+ )
584
  for instance in stream
585
  ]
586
  ),
 
605
 
606
  for instance, score in zip(stream, instance_scores):
607
  if "score" not in instance:
608
+ instance["score"] = {"global": {}, "instance": {}}
 
 
 
 
609
 
610
+ instance["score"]["instance"].update(
611
+ self._add_score_prefixes_to_score_dict(score)
612
+ )
613
  instances.append(instance)
614
 
615
  for reduction, fields in self.reduction_map.items():
 
619
 
620
  if reduction == "mean":
621
  for field_name in fields:
622
+ field_name_with_prefix = self._add_score_prefix(field_name)
623
+ global_score[field_name_with_prefix] = mean(
624
  [
625
+ instance["score"]["instance"][field_name_with_prefix]
626
  for instance in instances
627
  ]
628
  )
629
  if field_name == self.main_score:
630
+ global_score["score"] = global_score[field_name_with_prefix]
631
+ global_score["score_name"] = self.score_prefix + self.main_score
632
 
633
  ci_fields = (
634
  list(set(self.ci_scores))
635
  if self.ci_scores is not None
636
  else [self.main_score]
637
  )
638
+ ci_fields_with_prefix = [
639
+ self._add_score_prefix(ci_field) for ci_field in ci_fields
640
+ ]
641
  confidence_interval = self.score_based_confidence_interval(
642
+ instances=instances, score_names=ci_fields_with_prefix
643
  )
644
  global_score.update(confidence_interval)
645
 
646
  for instance in instances:
647
+ instance["score"]["global"].update(global_score)
648
  yield instance
649
 
650
  @abstractmethod
 
657
  pass
658
 
659
 
660
+ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
661
  """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs).
662
 
663
  InstanceMetric currently allows two reductions:
 
783
  ), f"each instance task_data dict must have a key {self.subgroup_column}"
784
 
785
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
786
+ instances = self.compute_instance_scores(stream)
787
+ global_score = {}
788
  for reduction_type, reduction_params in self.reduction_map.items():
789
  assert (
790
  reduction_type in self.implemented_reductions
 
830
 
831
  # calculate global scores for each reduction field
832
  for field_name in reduction_fields:
833
+ field_name_full = (
834
+ field_name_full_prefix + self.score_prefix + field_name
835
+ )
836
  # if group resampling (3rd element of agg_func parameter) is True, then
837
  # 1. scores_to_resample are the group scores, and
838
  # 2. aggregation_function is to take the raw mean
 
841
  # 2. aggregation_function is to apply the group aggregation from the instance scores
842
  # either way, the application of aggregation_function to scores_to_resample yields the global score
843
  global_score[field_name_full] = aggregation_function(
844
+ scores_to_resample, self.score_prefix + field_name
845
  )
846
  if field_name == self.main_score:
847
  global_score["score"] = global_score[field_name_full]
 
852
  if self.ci_scores is not None:
853
  confidence_interval = self.score_based_confidence_interval(
854
  instances=scores_to_resample,
855
+ score_names=[
856
+ self.score_prefix + ci_score for ci_score in set(self.ci_scores)
857
+ ],
858
  ci_score_prefix=field_name_full_prefix,
859
  aggregation_func=aggregation_function,
860
  )
861
  global_score.update(confidence_interval)
862
 
863
+ for instance in instances:
864
+ instance["score"]["global"].update(global_score)
865
  yield from instances
866
 
867
  def compute_instance_scores(
868
  self, stream: Stream, stream_name: Optional[str] = None
869
  ):
 
870
  instances = []
871
 
872
  for instance in stream:
873
+ instance = self.verify_instance(instance)
874
+
875
  task_data = instance["task_data"] if "task_data" in instance else {}
876
 
877
  if self.reference_field == "references":
 
891
  instance_score = self.compute(
892
  references=refs, prediction=pred, task_data=task_data
893
  )
894
+
895
  instance_score["score"] = instance_score[self.main_score]
896
  instance_score["score_name"] = self.main_score
897
  if "score" not in instance:
898
+ instance["score"] = {"global": {}, "instance": {}}
 
 
899
 
900
+ instance["score"]["instance"].update(
901
+ self._add_score_prefixes_to_score_dict(instance_score)
902
+ )
903
 
904
  instances.append(instance)
905
 
906
+ return instances
907
 
908
  def get_group_scores(
909
  self, instances: List[dict], score_names: List[str], group_aggregation_func
 
1125
  super().prepare()
1126
  self.prepare_score = CopyFields(
1127
  field_to_field=[
1128
+ [
1129
+ f"score/instance/{self.metric._add_score_prefix(self.main_score)}",
1130
+ "score/instance/score",
1131
+ ],
1132
+ [
1133
+ f"score/global/{self.metric._add_score_prefix(self.main_score)}",
1134
+ "score/global/score",
1135
+ ],
1136
  ],
1137
  )
1138
 
 
2147
  ] = [] # this is here for the sake of documentation for future models
2148
  mock_models: List[str] = ["mock"]
2149
  external_api_models = openai_models + anthropic_models
2150
+ data_classification_policy = ["public"]
2151
 
2152
  _requirements_list: List[str] = ["llama_index"]
2153
 
 
2229
  # treat the references as the questions and the predictions as answers
2230
  # assume a single reference
2231
 
2232
  query = task_data["question"]
2233
 
2234
  contexts = None
 
2778
  return element == "none"
2779
 
2780
 
2781
+ class RemoteMetric(StreamOperator, Metric):
2782
  """A metric that runs another metric remotely.
2783
 
2784
  main_score: the score updated by this metric.
 
2791
  endpoint: str
2792
  metric_name: str
2793
  api_key: str = None
2794
+ data_classification_policy = ["public", "proprietary"]
2795
 
2796
  @staticmethod
2797
  def wrap_inner_metric_pipeline_metric(
2798
+ metric_pipeline: MetricPipeline,
2799
+ remote_metrics_endpoint: str,
2800
  ) -> MetricPipeline:
2801
  """Wrap the inner metric in a MetricPipeline with a RemoteMetric.
2802
 
 
3709
  "mecab_ko": KO_ERROR_MESSAGE,
3710
  "mecab_ko_dic": KO_ERROR_MESSAGE,
3711
  }
3712
+
3713
+
3714
+ class CustomF1Fuzzy(CustomF1):
3715
+ def calculate_groups_ratio(self, actual_group, total_group):
3716
+ from fuzzywuzzy import fuzz
3717
+
3718
+ tmp = []
3719
+ for actual_key in actual_group.keys():
3720
+ max_score = self.fuzz_ratio
3721
+ best_total_key = None
3722
+
3723
+ for total_key in total_group.keys():
3724
+ tup_ac = ast.literal_eval(actual_key)
3725
+ tup_to = ast.literal_eval(total_key)
3726
+
3727
+ if tup_ac[1] == tup_to[1]:
3728
+ score = fuzz.ratio(tup_ac[0], tup_to[0])
3729
+ if score > max_score:
3730
+ max_score = score
3731
+ best_total_key = total_key
3732
+
3733
+ if best_total_key is not None:
3734
+ tmp.append(min(actual_group[actual_key], total_group[best_total_key]))
3735
+ else:
3736
+ tmp.append(min(actual_group[actual_key], 0))
3737
+ return sum(tmp), sum(actual_group.values())
3738
+
3739
+
3740
+ class FuzzyNer(CustomF1Fuzzy):
3741
+ prediction_type = "List[Tuple[str,str]]"
3742
+ fuzz_ratio = 75
3743
+
3744
+ def get_element_group(self, element, additional_input):
3745
+ return element[1]
3746
+
3747
+ def get_element_representation(self, element, additional_input):
3748
+ return str(element)
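For context, a hedged sketch of the matching rule CustomF1Fuzzy introduces: a predicted span counts toward a reference span of the same entity type only when fuzz.ratio between their surface forms exceeds fuzz_ratio (75 for FuzzyNer). The entity strings below are invented for illustration.

# Illustration only: the fuzzy-matching threshold used by CustomF1Fuzzy / FuzzyNer.
from fuzzywuzzy import fuzz

predicted_span = ("Barack Obma", "PER")   # misspelled surface form
reference_span = ("Barack Obama", "PER")  # same entity type, so the ratio is checked
similarity = fuzz.ratio(predicted_span[0], reference_span[0])
print(similarity)  # ~96, above the default fuzz_ratio of 75, so the spans are matched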
normalizers.py CHANGED
@@ -1,9 +1,9 @@
1
  from typing import Any, Dict, List, Optional
2
 
3
- from .operator import StreamInstanceOperator
4
 
5
 
6
- class NormalizeListFields(StreamInstanceOperator):
7
  fields: List[str]
8
  key_prefix: str = ""
9
  empty_value: str = ""
 
1
  from typing import Any, Dict, List, Optional
2
 
3
+ from .operator import InstanceOperator
4
 
5
 
6
+ class NormalizeListFields(InstanceOperator):
7
  fields: List[str]
8
  key_prefix: str = ""
9
  empty_value: str = ""
operator.py CHANGED
@@ -1,4 +1,3 @@
1
- import re
2
  from abc import abstractmethod
3
  from dataclasses import field
4
  from typing import Any, Dict, Generator, List, Optional, Union
@@ -208,12 +207,13 @@ class MultiStreamOperator(StreamingOperator):
208
  pass
209
 
210
  def process_instance(self, instance, stream_name="tmp"):
 
211
  multi_stream = MultiStream({stream_name: stream_single(instance)})
212
  processed_multi_stream = self(multi_stream)
213
  return next(iter(processed_multi_stream[stream_name]))
214
 
215
 
216
- class SingleStreamOperator(MultiStreamOperator):
217
  """A class representing a single-stream operator in the streaming system.
218
 
219
  A single-stream operator is a type of `MultiStreamOperator` that operates on individual
@@ -236,9 +236,7 @@ class SingleStreamOperator(MultiStreamOperator):
236
  stream = self._process_single_stream(stream, stream_name)
237
  else:
238
  stream = stream
239
- assert isinstance(
240
- stream, Stream
241
- ), "SingleStreamOperator must return a Stream"
242
  result[stream_name] = stream
243
 
244
  return MultiStream(result)
@@ -279,16 +277,21 @@ class SingleStreamOperator(MultiStreamOperator):
279
  pass
280
 
281
  def process_instance(self, instance, stream_name="tmp"):
 
282
  processed_stream = self._process_single_stream(
283
  stream_single(instance), stream_name
284
  )
285
  return next(iter(processed_stream))
286
 
287
 
288
- class PagedStreamOperator(SingleStreamOperator):
 
 
 
 
289
  """A class representing a paged-stream operator in the streaming system.
290
 
291
- A paged-stream operator is a type of `SingleStreamOperator` that operates on a page of instances
292
  in a `Stream` at a time, where a page is a subset of instances.
293
  The `process` method should be implemented by subclasses to define the specific operations
294
  to be performed on each page.
@@ -320,6 +323,7 @@ class PagedStreamOperator(SingleStreamOperator):
320
  pass
321
 
322
  def process_instance(self, instance, stream_name="tmp"):
 
323
  processed_stream = self._process_page([instance], stream_name)
324
  return next(iter(processed_stream))
325
 
@@ -343,10 +347,10 @@ class SingleStreamReducer(StreamingOperator):
343
  pass
344
 
345
 
346
- class StreamInstanceOperator(SingleStreamOperator):
347
  """A class representing a stream instance operator in the streaming system.
348
 
349
- A stream instance operator is a type of `SingleStreamOperator` that operates on individual instances within a `Stream`. It iterates through each instance in the `Stream` and applies the `process` method. The `process` method should be implemented by subclasses to define the specific operations to be performed on each instance.
350
  """
351
 
352
  def _process_stream(
@@ -367,6 +371,7 @@ class StreamInstanceOperator(SingleStreamOperator):
367
  def _process_instance(
368
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
369
  ) -> Dict[str, Any]:
 
370
  return self.process(instance, stream_name)
371
 
372
  @abstractmethod
@@ -379,10 +384,10 @@ class StreamInstanceOperator(SingleStreamOperator):
379
  return self._process_instance(instance, stream_name)
380
 
381
 
382
- class StreamInstanceOperatorValidator(StreamInstanceOperator):
383
  """A class representing a stream instance operator validator in the streaming system.
384
 
385
- A stream instance operator validator is a type of `StreamInstanceOperator` that includes a validation step. It operates on individual instances within a `Stream` and validates the result of processing each instance.
386
  """
387
 
388
  @abstractmethod
@@ -405,20 +410,6 @@ class StreamInstanceOperatorValidator(StreamInstanceOperator):
405
  )
406
 
407
 
408
- class InstanceOperator(Artifact):
409
- """A class representing an instance operator in the streaming system.
410
-
411
- An instance operator is a type of `Artifact` that operates on a single instance (represented as a dict) at a time. It takes an instance as input and produces a transformed instance as output.
412
- """
413
-
414
- def __call__(self, data: dict) -> dict:
415
- return self.process(data)
416
-
417
- @abstractmethod
418
- def process(self, data: dict) -> dict:
419
- pass
420
-
421
-
422
  class BaseFieldOperator(Artifact):
423
  """A class representing a field operator in the streaming system.
424
 
@@ -426,6 +417,7 @@ class BaseFieldOperator(Artifact):
426
  """
427
 
428
  def __call__(self, data: Dict[str, Any], field: str) -> dict:
 
429
  value = self.process(data[field])
430
  data[field] = value
431
  return data
@@ -456,7 +448,10 @@ class InstanceOperatorWithMultiStreamAccess(StreamingOperator):
456
  return MultiStream(result)
457
 
458
  def generator(self, stream, multi_stream):
459
- yield from (self.process(instance, multi_stream) for instance in stream)
 
 
 
460
 
461
  @abstractmethod
462
  def process(self, instance: dict, multi_stream: MultiStream) -> dict:
@@ -488,8 +483,7 @@ class SequentialOperator(MultiStreamOperator):
488
  last_step = (
489
  self.max_steps - 1 if self.max_steps is not None else len(self.steps) - 1
490
  )
491
- description = str(self.steps[last_step])
492
- return re.sub(r"\w+=None, ", "", description)
493
 
494
  def _get_max_steps(self):
495
  return self.max_steps if self.max_steps is not None else len(self.steps)
 
 
1
  from abc import abstractmethod
2
  from dataclasses import field
3
  from typing import Any, Dict, Generator, List, Optional, Union
 
207
  pass
208
 
209
  def process_instance(self, instance, stream_name="tmp"):
210
+ instance = self.verify_instance(instance)
211
  multi_stream = MultiStream({stream_name: stream_single(instance)})
212
  processed_multi_stream = self(multi_stream)
213
  return next(iter(processed_multi_stream[stream_name]))
214
 
215
 
216
+ class StreamOperator(MultiStreamOperator):
217
  """A class representing a single-stream operator in the streaming system.
218
 
219
  A single-stream operator is a type of `MultiStreamOperator` that operates on individual
 
236
  stream = self._process_single_stream(stream, stream_name)
237
  else:
238
  stream = stream
239
+ assert isinstance(stream, Stream), "StreamOperator must return a Stream"
 
 
240
  result[stream_name] = stream
241
 
242
  return MultiStream(result)
 
277
  pass
278
 
279
  def process_instance(self, instance, stream_name="tmp"):
280
+ instance = self.verify_instance(instance)
281
  processed_stream = self._process_single_stream(
282
  stream_single(instance), stream_name
283
  )
284
  return next(iter(processed_stream))
285
 
286
 
287
+ class SingleStreamOperator(StreamOperator):
288
+ pass
289
+
290
+
291
+ class PagedStreamOperator(StreamOperator):
292
  """A class representing a paged-stream operator in the streaming system.
293
 
294
+ A paged-stream operator is a type of `StreamOperator` that operates on a page of instances
295
  in a `Stream` at a time, where a page is a subset of instances.
296
  The `process` method should be implemented by subclasses to define the specific operations
297
  to be performed on each page.
 
323
  pass
324
 
325
  def process_instance(self, instance, stream_name="tmp"):
326
+ instance = self.verify_instance(instance)
327
  processed_stream = self._process_page([instance], stream_name)
328
  return next(iter(processed_stream))
329
 
 
347
  pass
348
 
349
 
350
+ class InstanceOperator(StreamOperator):
351
  """A class representing a stream instance operator in the streaming system.
352
 
353
+ A stream instance operator is a type of `StreamOperator` that operates on individual instances within a `Stream`. It iterates through each instance in the `Stream` and applies the `process` method. The `process` method should be implemented by subclasses to define the specific operations to be performed on each instance.
354
  """
355
 
356
  def _process_stream(
 
371
  def _process_instance(
372
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
373
  ) -> Dict[str, Any]:
374
+ instance = self.verify_instance(instance)
375
  return self.process(instance, stream_name)
376
 
377
  @abstractmethod
 
384
  return self._process_instance(instance, stream_name)
385
 
386
 
387
+ class InstanceOperatorValidator(InstanceOperator):
388
  """A class representing a stream instance operator validator in the streaming system.
389
 
390
+ A stream instance operator validator is a type of `InstanceOperator` that includes a validation step. It operates on individual instances within a `Stream` and validates the result of processing each instance.
391
  """
392
 
393
  @abstractmethod
 
410
  )
411
 
412
 
 
 
 
 
413
  class BaseFieldOperator(Artifact):
414
  """A class representing a field operator in the streaming system.
415
 
 
417
  """
418
 
419
  def __call__(self, data: Dict[str, Any], field: str) -> dict:
420
+ data = self.verify_instance(data)
421
  value = self.process(data[field])
422
  data[field] = value
423
  return data
 
448
  return MultiStream(result)
449
 
450
  def generator(self, stream, multi_stream):
451
+ yield from (
452
+ self.process(self.verify_instance(instance), multi_stream)
453
+ for instance in stream
454
+ )
455
 
456
  @abstractmethod
457
  def process(self, instance: dict, multi_stream: MultiStream) -> dict:
 
483
  last_step = (
484
  self.max_steps - 1 if self.max_steps is not None else len(self.steps) - 1
485
  )
486
+ return self.steps[last_step].__description__
 
487
 
488
  def _get_max_steps(self):
489
  return self.max_steps if self.max_steps is not None else len(self.steps)
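To make the renames concrete, here is a minimal sketch (not part of the commit) of a user-defined operator written against the new names: code that previously subclassed StreamInstanceOperator now subclasses InstanceOperator, while SingleStreamOperator remains available as a thin alias of StreamOperator.

# Hypothetical operator under the renamed hierarchy; the class and field names are invented.
from typing import Any, Dict, Optional

from unitxt.operator import InstanceOperator  # was: StreamInstanceOperator


class AddGreeting(InstanceOperator):
    """Adds a constant 'greeting' field to every instance in the stream."""

    greeting: str = "hello"

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        instance["greeting"] = self.greeting
        return instance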
operators.py CHANGED
@@ -29,9 +29,10 @@ Other specelized operators are used by unitxt internally:
29
 
30
  The rest of this section is dedicated for general operators.
31
 
32
- General Operaotrs List:
33
  ------------------------
34
  """
 
35
  import copy
36
  import operator
37
  import uuid
@@ -60,18 +61,18 @@ from .artifact import Artifact, fetch_artifact
60
  from .dataclass import DeprecatedField, NonPositionalField, OptionalField
61
  from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
62
  from .operator import (
 
63
  MultiStream,
64
  MultiStreamOperator,
65
  PackageRequirementsMixin,
66
  PagedStreamOperator,
67
  SequentialOperator,
68
  SideEffectOperator,
69
- SingleStreamOperator,
70
  SingleStreamReducer,
71
  SourceOperator,
72
  StreamingOperator,
73
  StreamInitializerOperator,
74
- StreamInstanceOperator,
75
  )
76
  from .random_utils import new_random_generator
77
  from .settings_utils import get_settings
@@ -116,10 +117,10 @@ class IterableSource(SourceOperator):
116
  return MultiStream.from_iterables(self.iterables)
117
 
118
 
119
- class MapInstanceValues(StreamInstanceOperator):
120
  """A class used to map instance values into other values.
121
 
122
- This class is a type of StreamInstanceOperator,
123
  it maps values of instances in a stream using predefined mappers.
124
 
125
  Attributes:
@@ -138,7 +139,7 @@ class MapInstanceValues(StreamInstanceOperator):
138
  replaces '1' with 'hi' and '2' with 'bye' in field 'a' in all instances of all streams:
139
  instance {"a":"1", "b": 2} becomes {"a":"hi", "b": 2}.
140
 
141
- MapInstanceValues(mappers={"a": {"1": "hi", "2": "bye"}}, process_every_element=True)
142
  Assuming field 'a' is a list of values, potentially including "1"-s and "2"-s, this replaces
143
  each such "1" with "hi" and "2" -- with "bye" in all instances of all streams:
144
  instance {"a": ["1", "2"], "b": 2} becomes {"a": ["hi", "bye"], "b": 2}.
@@ -204,7 +205,7 @@ class MapInstanceValues(StreamInstanceOperator):
204
  return val
205
 
206
 
207
- class FlattenInstances(StreamInstanceOperator):
208
  """Flattens each instance in a stream, making nested dictionary entries into top-level entries.
209
 
210
  Args:
@@ -221,7 +222,7 @@ class FlattenInstances(StreamInstanceOperator):
221
  return flatten_dict(instance, parent_key=self.parent_key, sep=self.sep)
222
 
223
 
224
- class AddFields(StreamInstanceOperator):
225
  """Adds specified fields to each instance in a given stream or all streams (default) If fields exist, updates them.
226
 
227
  Args:
@@ -264,7 +265,7 @@ class AddFields(StreamInstanceOperator):
264
  return instance
265
 
266
 
267
- class RemoveFields(StreamInstanceOperator):
268
  """Remove specified fields from each instance in a stream.
269
 
270
  Args:
@@ -281,7 +282,7 @@ class RemoveFields(StreamInstanceOperator):
281
  return instance
282
 
283
 
284
- class InstanceFieldOperator(StreamInstanceOperator):
285
  """A general stream instance operator that processes the values of a field (or multiple ones).
286
 
287
  Args:
@@ -393,6 +394,11 @@ class InstanceFieldOperator(StreamInstanceOperator):
393
  def process(
394
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
395
  ) -> Dict[str, Any]:
 
 
 
 
 
396
  for from_field, to_field in self._field_to_field:
397
  try:
398
  old_value = dict_get(
@@ -485,7 +491,7 @@ class AddConstant(FieldOperator):
485
  return self.add + value
486
 
487
 
488
- class Augmentor(StreamInstanceOperator):
489
  """A stream operator that augments the values of either the task input fields before rendering with the template, or the input passed to the model after rendering of the template.
490
 
491
  Args:
@@ -732,7 +738,7 @@ class JoinStr(FieldOperator):
732
  return self.separator.join(str(x) for x in value)
733
 
734
 
735
- class Apply(StreamInstanceOperator):
736
  """A class used to apply a python function and store the result in a field.
737
 
738
  Args:
@@ -802,7 +808,7 @@ class Apply(StreamInstanceOperator):
802
  return instance
803
 
804
 
805
- class ListFieldValues(StreamInstanceOperator):
806
  """Concatenates values of multiple fields into a list, and assigns it to a new field."""
807
 
808
  fields: List[str]
@@ -824,7 +830,7 @@ class ListFieldValues(StreamInstanceOperator):
824
  return instance
825
 
826
 
827
- class ZipFieldValues(StreamInstanceOperator):
828
  """Zips values of multiple fields in a given instance, similar to list(zip(*fields)).
829
 
830
  The value in each of the specified 'fields' is assumed to be a list. The lists from all 'fields'
@@ -860,7 +866,7 @@ class ZipFieldValues(StreamInstanceOperator):
860
  return instance
861
 
862
 
863
- class InterleaveListsToDialogOperator(StreamInstanceOperator):
864
  """Interleaves two lists, one of user dialog turns and one of assistant dialog turns, into a single list of tuples, alternating between "user" and "assistant".
865
 
866
  The list of tuples if of format (role, turn_content), where the role label is specified by
@@ -905,7 +911,7 @@ class InterleaveListsToDialogOperator(StreamInstanceOperator):
905
  return instance
906
 
907
 
908
- class IndexOf(StreamInstanceOperator):
909
  """For a given instance, finds the offset of value of field 'index_of', within the value of field 'search_in'."""
910
 
911
  search_in: str
@@ -927,7 +933,7 @@ class IndexOf(StreamInstanceOperator):
927
  return instance
928
 
929
 
930
- class TakeByField(StreamInstanceOperator):
931
  """From field 'field' of a given instance, select the member indexed by field 'index', and store to field 'to_field'."""
932
 
933
  field: str
@@ -1034,7 +1040,7 @@ class GetItemByIndex(FieldOperator):
1034
  return self.items_list[value]
1035
 
1036
 
1037
- class AddID(StreamInstanceOperator):
1038
  """Stores a unique id value in the designated 'id_field_name' field of the given instance."""
1039
 
1040
  id_field_name: str = "id"
@@ -1046,7 +1052,7 @@ class AddID(StreamInstanceOperator):
1046
  return instance
1047
 
1048
 
1049
- class CastFields(StreamInstanceOperator):
1050
  """Casts specified fields to specified types.
1051
 
1052
  Args:
@@ -1106,7 +1112,7 @@ class CastFields(StreamInstanceOperator):
1106
  return instance
1107
 
1108
 
1109
- class DivideAllFieldsBy(StreamInstanceOperator):
1110
  """Recursively reach down to all fields that are float, and divide each by 'divisor'.
1111
 
1112
  The given instance is viewed as a tree whose internal nodes are dictionaries and lists, and
@@ -1165,7 +1171,7 @@ class ArtifactFetcherMixin:
1165
  return cls.cache[artifact_identifier]
1166
 
1167
 
1168
- class ApplyOperatorsField(StreamInstanceOperator):
1169
  """Applies value operators to each instance in a stream based on specified fields.
1170
 
1171
  Args:
@@ -1206,7 +1212,7 @@ class ApplyOperatorsField(StreamInstanceOperator):
1206
  return operator.process_instance(instance)
1207
 
1208
 
1209
- class FilterByCondition(SingleStreamOperator):
1210
  """Filters a stream, yielding only instances in which the values in required fields follow the required condition operator.
1211
 
1212
  Raises an error if a required field name is missing from the input instance.
@@ -1322,7 +1328,7 @@ class ComputeExpressionMixin(Artifact):
1322
  )
1323
 
1324
 
1325
- class FilterByExpression(SingleStreamOperator, ComputeExpressionMixin):
1326
  """Filters a stream, yielding only instances which fulfil a condition specified as a string to be python's eval-uated.
1327
 
1328
  Raises an error if a field participating in the specified condition is missing from the instance
@@ -1337,9 +1343,7 @@ class FilterByExpression(SingleStreamOperator, ComputeExpressionMixin):
1337
  FilterByExpression(expression = "a <= 4 and b > 5") will yield only instances where the value of field "a" is not exceeding 4 and in field "b" -- greater than 5
1338
  FilterByExpression(expression = "a in [4, 8]") will yield only instances where "a" is 4 or 8
1339
  FilterByExpression(expression = "a not in [4, 8]") will yield only instances where "a" is neither 4 nor 8
1340
- FilterByExpression(expression = "a['b'] not in [4, 8]") will yield only instances where "a" is a dict in
1341
- which key 'b' is mapped to a value that is neither 4 nor 8
1342
-
1343
  """
1344
 
1345
  error_on_filtered_all: bool = True
@@ -1357,7 +1361,7 @@ class FilterByExpression(SingleStreamOperator, ComputeExpressionMixin):
1357
  )
1358
 
1359
 
1360
- class ExecuteExpression(StreamInstanceOperator, ComputeExpressionMixin):
1361
  """Compute an expression, specified as a string to be eval-uated, over the instance's fields, and store the result in field to_field.
1362
 
1363
  Raises an error if a field mentioned in the query is missing from the instance.
@@ -1651,7 +1655,7 @@ class SplitByNestedGroup(MultiStreamOperator):
1651
  return MultiStream.from_iterables(result)
1652
 
1653
 
1654
- class ApplyStreamOperatorsField(SingleStreamOperator, ArtifactFetcherMixin):
1655
  """Applies stream operators to a stream based on specified fields in each instance.
1656
 
1657
  Args:
@@ -1676,14 +1680,14 @@ class ApplyStreamOperatorsField(SingleStreamOperator, ArtifactFetcherMixin):
1676
  operator = self.get_artifact(operator_name)
1677
  assert isinstance(
1678
  operator, StreamingOperator
1679
- ), f"Operator {operator_name} must be a SingleStreamOperator"
1680
 
1681
  stream = operator(MultiStream({"tmp": stream}))["tmp"]
1682
 
1683
  yield from stream
1684
 
1685
 
1686
- class ApplyMetric(SingleStreamOperator, ArtifactFetcherMixin):
1687
  """Applies metric operators to a stream based on a metric field specified in each instance.
1688
 
1689
  Args:
@@ -1855,7 +1859,7 @@ class FeatureGroupedShuffle(Shuffle):
1855
  return list(itertools.chain(*page_blocks))
1856
 
1857
 
1858
- class EncodeLabels(StreamInstanceOperator):
1859
  """Encode each value encountered in any field in 'fields' into the integers 0,1,...
1860
 
1861
  Encoding is determined by a str->int map that is built on the go, as different values are
@@ -1908,7 +1912,7 @@ class EncodeLabels(StreamInstanceOperator):
1908
  return instance
1909
 
1910
 
1911
- class StreamRefiner(SingleStreamOperator):
1912
  """Discard from the input stream all instances beyond the leading 'max_instances' instances.
1913
 
1914
  Thereby, if the input stream consists of no more than 'max_instances' instances, the resulting stream is the whole of the
@@ -1987,6 +1991,80 @@ class DeterministicBalancer(StreamRefiner):
1987
  yield instance
1988
 
1989
 
 
 
 
 
 
1990
  class LengthBalancer(DeterministicBalancer):
1991
  """Balances by a signature that reflects the total length of the fields' values, quantized into integer segments.
1992
 
@@ -2071,7 +2149,7 @@ class ExtractZipFile(SideEffectOperator):
2071
  zf.extractall(self.target_dir)
2072
 
2073
 
2074
- class DuplicateInstances(SingleStreamOperator):
2075
  """Operator which duplicates each instance in stream a given number of times.
2076
 
2077
  Attributes:
 
29
 
30
  The rest of this section is dedicated for general operators.
31
 
32
+ General Operators List:
33
  ------------------------
34
  """
35
+
36
  import copy
37
  import operator
38
  import uuid
 
61
  from .dataclass import DeprecatedField, NonPositionalField, OptionalField
62
  from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
63
  from .operator import (
64
+ InstanceOperator,
65
  MultiStream,
66
  MultiStreamOperator,
67
  PackageRequirementsMixin,
68
  PagedStreamOperator,
69
  SequentialOperator,
70
  SideEffectOperator,
 
71
  SingleStreamReducer,
72
  SourceOperator,
73
  StreamingOperator,
74
  StreamInitializerOperator,
75
+ StreamOperator,
76
  )
77
  from .random_utils import new_random_generator
78
  from .settings_utils import get_settings
 
117
  return MultiStream.from_iterables(self.iterables)
118
 
119
 
120
+ class MapInstanceValues(InstanceOperator):
121
  """A class used to map instance values into other values.
122
 
123
+ This class is a type of InstanceOperator,
124
  it maps values of instances in a stream using predefined mappers.
125
 
126
  Attributes:
 
139
  replaces '1' with 'hi' and '2' with 'bye' in field 'a' in all instances of all streams:
140
  instance {"a":"1", "b": 2} becomes {"a":"hi", "b": 2}.
141
 
142
+ MapInstanceValues(mappers={"a": {"1": "hi", "2": "bye"}}, process_every_value=True)
143
  Assuming field 'a' is a list of values, potentially including "1"-s and "2"-s, this replaces
144
  each such "1" with "hi" and "2" -- with "bye" in all instances of all streams:
145
  instance {"a": ["1", "2"], "b": 2} becomes {"a": ["hi", "bye"], "b": 2}.
 
205
  return val
206
 
207
 
208
+ class FlattenInstances(InstanceOperator):
209
  """Flattens each instance in a stream, making nested dictionary entries into top-level entries.
210
 
211
  Args:
 
222
  return flatten_dict(instance, parent_key=self.parent_key, sep=self.sep)
223
 
224
 
225
+ class AddFields(InstanceOperator):
226
  """Adds specified fields to each instance in a given stream or all streams (default) If fields exist, updates them.
227
 
228
  Args:
 
265
  return instance
266
 
267
 
268
+ class RemoveFields(InstanceOperator):
269
  """Remove specified fields from each instance in a stream.
270
 
271
  Args:
 
282
  return instance
283
 
284
 
285
+ class InstanceFieldOperator(InstanceOperator):
286
  """A general stream instance operator that processes the values of a field (or multiple ones).
287
 
288
  Args:
 
394
  def process(
395
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
396
  ) -> Dict[str, Any]:
397
+ # Need to deep copy the instance, because when assigning two dictionary fields,
398
+ # dict_set() assigns the target field a reference to the source dictionary.
399
+ # This means that if this target field was assigned to another field before,
400
+ # that field is updated as well.
401
+ instance = deepcopy(instance)
402
  for from_field, to_field in self._field_to_field:
403
  try:
404
  old_value = dict_get(
 
491
  return self.add + value
492
 
493
 
494
+ class Augmentor(InstanceOperator):
495
  """A stream operator that augments the values of either the task input fields before rendering with the template, or the input passed to the model after rendering of the template.
496
 
497
  Args:
 
738
  return self.separator.join(str(x) for x in value)
739
 
740
 
741
+ class Apply(InstanceOperator):
742
  """A class used to apply a python function and store the result in a field.
743
 
744
  Args:
 
808
  return instance
809
 
810
 
811
+ class ListFieldValues(InstanceOperator):
812
  """Concatenates values of multiple fields into a list, and assigns it to a new field."""
813
 
814
  fields: List[str]
 
830
  return instance
831
 
832
 
833
+ class ZipFieldValues(InstanceOperator):
834
  """Zips values of multiple fields in a given instance, similar to list(zip(*fields)).
835
 
836
  The value in each of the specified 'fields' is assumed to be a list. The lists from all 'fields'
 
866
  return instance
867
 
868
 
869
+ class InterleaveListsToDialogOperator(InstanceOperator):
870
  """Interleaves two lists, one of user dialog turns and one of assistant dialog turns, into a single list of tuples, alternating between "user" and "assistant".
871
 
872
  The list of tuples if of format (role, turn_content), where the role label is specified by
 
911
  return instance
912
 
913
 
914
+ class IndexOf(InstanceOperator):
915
  """For a given instance, finds the offset of value of field 'index_of', within the value of field 'search_in'."""
916
 
917
  search_in: str
 
933
  return instance
934
 
935
 
936
+ class TakeByField(InstanceOperator):
937
  """From field 'field' of a given instance, select the member indexed by field 'index', and store to field 'to_field'."""
938
 
939
  field: str
 
1040
  return self.items_list[value]
1041
 
1042
 
1043
+ class AddID(InstanceOperator):
1044
  """Stores a unique id value in the designated 'id_field_name' field of the given instance."""
1045
 
1046
  id_field_name: str = "id"
 
1052
  return instance
1053
 
1054
 
1055
+ class CastFields(InstanceOperator):
1056
  """Casts specified fields to specified types.
1057
 
1058
  Args:
 
1112
  return instance
1113
 
1114
 
1115
+ class DivideAllFieldsBy(InstanceOperator):
1116
  """Recursively reach down to all fields that are float, and divide each by 'divisor'.
1117
 
1118
  The given instance is viewed as a tree whose internal nodes are dictionaries and lists, and
 
1171
  return cls.cache[artifact_identifier]
1172
 
1173
 
1174
+ class ApplyOperatorsField(InstanceOperator):
1175
  """Applies value operators to each instance in a stream based on specified fields.
1176
 
1177
  Args:
 
1212
  return operator.process_instance(instance)
1213
 
1214
 
1215
+ class FilterByCondition(StreamOperator):
1216
  """Filters a stream, yielding only instances in which the values in required fields follow the required condition operator.
1217
 
1218
  Raises an error if a required field name is missing from the input instance.
 
1328
  )
1329
 
1330
 
1331
+ class FilterByExpression(StreamOperator, ComputeExpressionMixin):
1332
  """Filters a stream, yielding only instances which fulfil a condition specified as a string to be python's eval-uated.
1333
 
1334
  Raises an error if a field participating in the specified condition is missing from the instance
 
1343
  FilterByExpression(expression = "a <= 4 and b > 5") will yield only instances where the value of field "a" is not exceeding 4 and in field "b" -- greater than 5
1344
  FilterByExpression(expression = "a in [4, 8]") will yield only instances where "a" is 4 or 8
1345
  FilterByExpression(expression = "a not in [4, 8]") will yield only instances where "a" is neither 4 nor 8
1346
+ FilterByExpression(expression = "a['b'] not in [4, 8]") will yield only instances where "a" is a dict in which key 'b' is mapped to a value that is neither 4 nor 8
 
 
1347
  """
1348
 
1349
  error_on_filtered_all: bool = True
 
1361
  )
1362
 
1363
 
1364
+ class ExecuteExpression(InstanceOperator, ComputeExpressionMixin):
1365
  """Compute an expression, specified as a string to be eval-uated, over the instance's fields, and store the result in field to_field.
1366
 
1367
  Raises an error if a field mentioned in the query is missing from the instance.
 
1655
  return MultiStream.from_iterables(result)
1656
 
1657
 
1658
+ class ApplyStreamOperatorsField(StreamOperator, ArtifactFetcherMixin):
1659
  """Applies stream operators to a stream based on specified fields in each instance.
1660
 
1661
  Args:
 
1680
  operator = self.get_artifact(operator_name)
1681
  assert isinstance(
1682
  operator, StreamingOperator
1683
+ ), f"Operator {operator_name} must be a StreamOperator"
1684
 
1685
  stream = operator(MultiStream({"tmp": stream}))["tmp"]
1686
 
1687
  yield from stream
1688
 
1689
 
1690
+ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
1691
  """Applies metric operators to a stream based on a metric field specified in each instance.
1692
 
1693
  Args:
 
1859
  return list(itertools.chain(*page_blocks))
1860
 
1861
 
1862
+ class EncodeLabels(InstanceOperator):
1863
  """Encode each value encountered in any field in 'fields' into the integers 0,1,...
1864
 
1865
  Encoding is determined by a str->int map that is built on the go, as different values are
 
1912
  return instance
1913
 
1914
 
1915
+ class StreamRefiner(StreamOperator):
1916
  """Discard from the input stream all instances beyond the leading 'max_instances' instances.
1917
 
1918
  Thereby, if the input stream consists of no more than 'max_instances' instances, the resulting stream is the whole of the
 
1991
  yield instance
1992
 
1993
 
1994
+ class MinimumOneExamplePerLabelRefiner(StreamRefiner):
1995
+ """A class used to return a specified number of instances while ensuring at least one example per label.
1996
+
1997
+ For each instance, a signature value is constructed from the values of the instance in the specified input 'fields'.
1998
+ MinimumOneExamplePerLabelRefiner takes the first instance that appears for each label (each unique signature), and then adds more elements up to the max_instances limit. In general, the refiner takes the first elements in the stream that meet the required conditions.
1999
+ MinimumOneExamplePerLabelRefiner then shuffles the results to avoid having one instance
2000
+ from each class first and then the rest. If max_instances is not set, the original stream will be used.
2001
+
2002
+ Attributes:
2003
+ fields (List[str]): A list of field names to be used in producing the instance's signature.
2004
+ max_instances (Optional, int): Number of elements to select. Note that max_instances of StreamRefiners that are passed to the recipe (e.g. `train_refiner`, `test_refiner`) are overridden by the recipe parameters (`max_train_instances`, `max_test_instances`).
2005
+
2006
+ Usage:
2007
+ balancer = MinimumOneExamplePerLabelRefiner(fields=["field1", "field2"], max_instances=200)
2008
+ balanced_stream = balancer.process(stream)
2009
+
2010
+ Example:
2011
+ When input [{"a": 1, "b": 1},{"a": 1, "b": 2},{"a": 1, "b": 3},{"a": 1, "b": 4},{"a": 2, "b": 5}] is fed into
2012
+ MinimumOneExamplePerLabelRefiner(fields=["a"], max_instances=3)
2013
+ the resulting stream will be:
2014
+ [{'a': 1, 'b': 1}, {'a': 1, 'b': 2}, {'a': 2, 'b': 5}] (order may be different)
2015
+ """
2016
+
2017
+ fields: List[str]
2018
+
2019
+ def signature(self, instance):
2020
+ return str(tuple(dict_get(instance, field) for field in self.fields))
2021
+
2022
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
2023
+ if self.max_instances is None:
2024
+ for instance in stream:
2025
+ yield instance
2026
+
2027
+ counter = Counter()
2028
+ for instance in stream:
2029
+ counter[self.signature(instance)] += 1
2030
+ all_keys = counter.keys()
2031
+ if len(counter) == 0:
2032
+ return
2033
+
2034
+ if self.max_instances is not None and len(all_keys) > self.max_instances:
2035
+ raise Exception(
2036
+ f"Can not generate a stream with at least one example per label, because the max instances requested {self.max_instances} is smaller than the number of different labels {len(all_keys)}"
2037
+ f" ({len(all_keys)}"
2038
+ )
2039
+
2040
+ counter = Counter()
2041
+ used_indices = set()
2042
+ selected_elements = []
2043
+ # select at least one per class
2044
+ for idx, instance in enumerate(stream):
2045
+ sign = self.signature(instance)
2046
+ if counter[sign] == 0:
2047
+ counter[sign] += 1
2048
+ used_indices.add(idx)
2049
+ selected_elements.append(
2050
+ instance
2051
+ ) # collect all elements first to allow shuffling of both groups
2052
+
2053
+ # select more to reach self.max_instances examples
2054
+ for idx, instance in enumerate(stream):
2055
+ if idx not in used_indices:
2056
+ if self.max_instances is None or len(used_indices) < self.max_instances:
2057
+ used_indices.add(idx)
2058
+ selected_elements.append(
2059
+ instance
2060
+ ) # collect all elements first to allow shuffling of both groups
2061
+
2062
+ # shuffle elements to avoid having one element from each class appear first
2063
+ random_generator = new_random_generator(sub_seed=selected_elements)
2064
+ random_generator.shuffle(selected_elements)
2065
+ yield from selected_elements
2066
+
2067
+
2068
  class LengthBalancer(DeterministicBalancer):
2069
  """Balances by a signature that reflects the total length of the fields' values, quantized into integer segments.
2070
 
 
2149
  zf.extractall(self.target_dir)
2150
 
2151
 
2152
+ class DuplicateInstances(StreamOperator):
2153
  """Operator which duplicates each instance in stream a given number of times.
2154
 
2155
  Attributes:
schema.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional
4
 
5
  from datasets import Features, Sequence, Value
6
 
7
- from .operator import StreamInstanceOperatorValidator
8
 
9
  UNITXT_DATASET_SCHEMA = Features(
10
  {
@@ -15,20 +15,12 @@ UNITXT_DATASET_SCHEMA = Features(
15
  "group": Value("string"),
16
  "postprocessors": Sequence(Value("string")),
17
  "task_data": Value(dtype="string"),
 
18
  }
19
  )
20
 
21
- # UNITXT_METRIC_SCHEMA = Features({
22
- # "predictions": Value("string", id="sequence"),
23
- # "target": Value("string", id="sequence"),
24
- # "references": Value("string", id="sequence"),
25
- # "metrics": Value("string", id="sequence"),
26
- # 'group': Value('string'),
27
- # 'postprocessors': Value("string", id="sequence"),
28
- # })
29
 
30
-
31
- class ToUnitxtGroup(StreamInstanceOperatorValidator):
32
  group: str
33
  metrics: List[str] = None
34
  postprocessors: List[str] = field(default_factory=lambda: ["to_string_stripped"])
 
4
 
5
  from datasets import Features, Sequence, Value
6
 
7
+ from .operator import InstanceOperatorValidator
8
 
9
  UNITXT_DATASET_SCHEMA = Features(
10
  {
 
15
  "group": Value("string"),
16
  "postprocessors": Sequence(Value("string")),
17
  "task_data": Value(dtype="string"),
18
+ "data_classification_policy": Sequence(Value("string")),
19
  }
20
  )
21
 
 
 
 
 
22
 
23
+ class ToUnitxtGroup(InstanceOperatorValidator):
 
24
  group: str
25
  metrics: List[str] = None
26
  postprocessors: List[str] = field(default_factory=lambda: ["to_string_stripped"])
settings_utils.py CHANGED
@@ -128,12 +128,12 @@ if Settings.is_uninitilized():
128
  settings.default_recipe = "standard_recipe"
129
  settings.default_verbosity = "info"
130
  settings.remote_metrics = []
131
- settings.allow_passing_data_to_remote_api = (bool, False)
132
  settings.test_card_disable = (bool, False)
133
  settings.test_metric_disable = (bool, False)
134
  settings.metrics_master_key_token = None
135
  settings.seed = (int, 42)
136
  settings.skip_artifacts_prepare_and_verify = (bool, False)
 
137
 
138
  if Constants.is_uninitilized():
139
  constants = Constants()
 
128
  settings.default_recipe = "standard_recipe"
129
  settings.default_verbosity = "info"
130
  settings.remote_metrics = []
 
131
  settings.test_card_disable = (bool, False)
132
  settings.test_metric_disable = (bool, False)
133
  settings.metrics_master_key_token = None
134
  settings.seed = (int, 42)
135
  settings.skip_artifacts_prepare_and_verify = (bool, False)
136
+ settings.data_classification_policy = None
137
 
138
  if Constants.is_uninitilized():
139
  constants = Constants()
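A hedged sketch of how the new setting can be adjusted at runtime; the environment-variable spelling in the comment follows the usual UNITXT_* naming convention and is an assumption, not something stated in this diff.

# Sketch: overriding the new setting programmatically.
from unitxt.settings_utils import get_settings

settings = get_settings()
settings.data_classification_policy = None  # the default: no policy restriction is enforced
# Presumably the same value can also be supplied through an environment variable
# (e.g. UNITXT_DATA_CLASSIFICATION_POLICY); that name is an assumption, not from this diff.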
span_lableing_operators.py CHANGED
@@ -1,9 +1,9 @@
1
  from typing import Any, Dict, List, Optional
2
 
3
- from .operator import StreamInstanceOperator
4
 
5
 
6
- class IobExtractor(StreamInstanceOperator):
7
  """A class designed to extract entities from sequences of text using the Inside-Outside-Beginning (IOB) tagging convention. It identifies entities based on IOB tags and categorizes them into predefined labels such as Person, Organization, and Location.
8
 
9
  Attributes:
 
1
  from typing import Any, Dict, List, Optional
2
 
3
+ from .operator import InstanceOperator
4
 
5
 
6
+ class IobExtractor(InstanceOperator):
7
  """A class designed to extract entities from sequences of text using the Inside-Outside-Beginning (IOB) tagging convention. It identifies entities based on IOB tags and categorizes them into predefined labels such as Person, Organization, and Location.
8
 
9
  Attributes:
standard.py CHANGED
@@ -124,11 +124,23 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
124
 
125
  def set_pipelines(self):
126
  self.loading = SequentialOperator()
 
127
  self.metadata = SequentialOperator()
 
 
 
128
  self.standardization = SequentialOperator()
 
 
 
129
  self.processing = SequentialOperator()
 
 
 
130
  self.verblization = SequentialOperator()
 
131
  self.finalize = SequentialOperator()
 
132
 
133
  self.steps = [
134
  self.loading,
@@ -211,7 +223,6 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
211
  AddFields(
212
  fields={
213
  "recipe_metadata": {
214
- "card": self.card,
215
  "template": self.template,
216
  "system_prompt": self.system_prompt,
217
  "format": self.format,
@@ -228,7 +239,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
228
  self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
229
  self.processing.steps.append(self.augmentor)
230
 
231
- if self.demos_pool_size is not None:
232
  self.processing.steps.append(
233
  CreateDemosPool(
234
  from_split=self.demos_taken_from,
 
124
 
125
  def set_pipelines(self):
126
  self.loading = SequentialOperator()
127
+ self.loading.__description__ = "Loading the data from the data source."
128
  self.metadata = SequentialOperator()
129
+ self.metadata.__description__ = (
130
+ "Adding metadata (e.g. format, system prompt, template) "
131
+ )
132
  self.standardization = SequentialOperator()
133
+ self.standardization.__description__ = (
134
+ "Standardizing the raw dataset fields to task field definition."
135
+ )
136
  self.processing = SequentialOperator()
137
+ self.processing.__description__ = (
138
+ "Setting task fields (and selecting demos per sample if needed)."
139
+ )
140
  self.verblization = SequentialOperator()
141
+ self.verblization.__description__ = "Verbalizing the input to the model and gold references to the 'source', 'target' and 'references' fields."
142
  self.finalize = SequentialOperator()
143
+ self.finalize.__description__ = "Adding post processors. Removing intermediate fields. Creating the final output dataset."
144
 
145
  self.steps = [
146
  self.loading,
 
223
  AddFields(
224
  fields={
225
  "recipe_metadata": {
 
226
  "template": self.template,
227
  "system_prompt": self.system_prompt,
228
  "format": self.format,
 
239
  self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
240
  self.processing.steps.append(self.augmentor)
241
 
242
+ if self.demos_pool_size is not None and self.demos_pool_size > 0:
243
  self.processing.steps.append(
244
  CreateDemosPool(
245
  from_split=self.demos_taken_from,
string_operators.py CHANGED
@@ -6,7 +6,7 @@ from typing import (
6
  Optional,
7
  )
8
 
9
- from .operators import FieldOperator, StreamInstanceOperator
10
 
11
 
12
  class Split(FieldOperator):
@@ -44,7 +44,7 @@ class Join(FieldOperator):
44
  return self.by.join(value)
45
 
46
 
47
- class FormatText(StreamInstanceOperator):
48
  to_field: str
49
  text: str
50
 
 
6
  Optional,
7
  )
8
 
9
+ from .operators import FieldOperator, InstanceOperator
10
 
11
 
12
  class Split(FieldOperator):
 
44
  return self.by.join(value)
45
 
46
 
47
+ class FormatText(InstanceOperator):
48
  to_field: str
49
  text: str
50
 
struct_data_operators.py CHANGED
@@ -28,7 +28,7 @@ from typing import (
28
  import pandas as pd
29
 
30
  from .dict_utils import dict_get
31
- from .operators import FieldOperator, StreamInstanceOperator
32
 
33
 
34
  class SerializeTable(ABC, FieldOperator):
@@ -237,7 +237,7 @@ def truncate_cell(cell_value, max_len):
237
  return None
238
 
239
 
240
- class TruncateTableCells(StreamInstanceOperator):
241
  """Limit the maximum length of cell values in a table to reduce the overall length.
242
 
243
  Args:
@@ -318,7 +318,7 @@ class TruncateTableRows(FieldOperator):
318
  return table_content
319
 
320
 
321
- class SerializeTableRowAsText(StreamInstanceOperator):
322
  """Serializes a table row as text.
323
 
324
  Args:
@@ -348,7 +348,7 @@ class SerializeTableRowAsText(StreamInstanceOperator):
348
  return instance
349
 
350
 
351
- class SerializeTableRowAsList(StreamInstanceOperator):
352
  """Serializes a table row as list.
353
 
354
  Args:
@@ -417,7 +417,7 @@ class SerializeKeyValPairs(FieldOperator):
417
  return serialized_str[:-2]
418
 
419
 
420
- class ListToKeyValPairs(StreamInstanceOperator):
421
  """Maps list of keys and values into key:value pairs.
422
 
423
  Sample input in expected format: {"keys": ["name", "age", "sex"], "values": ["Alex", 31, "M"]}
@@ -512,16 +512,16 @@ class ShuffleTableColumns(FieldOperator):
512
  """Shuffles the table columns randomly.
513
 
514
  Sample Input:
515
- {
516
- "header": ["name", "age"],
517
- "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
518
- }
519
 
520
  Sample Output:
521
- {
522
- "header": ["age", "name"],
523
- "rows": [[26, "Alex"], [34, "Raj"], [39, "Donald"]],
524
- }
525
  """
526
 
527
  def process_value(self, table: Any) -> Any:
 
28
  import pandas as pd
29
 
30
  from .dict_utils import dict_get
31
+ from .operators import FieldOperator, InstanceOperator
32
 
33
 
34
  class SerializeTable(ABC, FieldOperator):
 
237
  return None
238
 
239
 
240
+ class TruncateTableCells(InstanceOperator):
241
  """Limit the maximum length of cell values in a table to reduce the overall length.
242
 
243
  Args:
 
318
  return table_content
319
 
320
 
321
+ class SerializeTableRowAsText(InstanceOperator):
322
  """Serializes a table row as text.
323
 
324
  Args:
 
348
  return instance
349
 
350
 
351
+ class SerializeTableRowAsList(InstanceOperator):
352
  """Serializes a table row as list.
353
 
354
  Args:
 
417
  return serialized_str[:-2]
418
 
419
 
420
+ class ListToKeyValPairs(InstanceOperator):
421
  """Maps list of keys and values into key:value pairs.
422
 
423
  Sample input in expected format: {"keys": ["name", "age", "sex"], "values": ["Alex", 31, "M"]}
 
512
  """Shuffles the table columns randomly.
513
 
514
  Sample Input:
515
+ {
516
+ "header": ["name", "age"],
517
+ "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
518
+ }
519
 
520
  Sample Output:
521
+ {
522
+ "header": ["age", "name"],
523
+ "rows": [[26, "Alex"], [34, "Raj"], [39, "Donald"]],
524
+ }
525
  """
526
 
527
  def process_value(self, table: Any) -> Any:
system_prompts.py CHANGED
@@ -2,10 +2,10 @@ from abc import abstractmethod
2
  from typing import Any, Dict, Optional
3
 
4
  from .dataclass import NonPositionalField
5
- from .operator import StreamInstanceOperator
6
 
7
 
8
- class SystemPrompt(StreamInstanceOperator):
9
  """The role of SystemPrompt is to add task-independent opening-text to every instance."""
10
 
11
  skip_rendered_instance: bool = NonPositionalField(default=True)
 
2
  from typing import Any, Dict, Optional
3
 
4
  from .dataclass import NonPositionalField
5
+ from .operator import InstanceOperator
6
 
7
 
8
+ class SystemPrompt(InstanceOperator):
9
  """The role of SystemPrompt is to add task-independent opening-text to every instance."""
10
 
11
  skip_rendered_instance: bool = NonPositionalField(default=True)
task.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Union
3
 
4
  from .artifact import fetch_artifact
5
  from .logging_utils import get_logger
6
- from .operator import StreamInstanceOperator
7
  from .type_utils import (
8
  get_args,
9
  get_origin,
@@ -13,8 +13,8 @@ from .type_utils import (
13
  )
14
 
15
 
16
- class Task(StreamInstanceOperator):
17
- """FormTask packs the different instance fields into dictionaries by their roles in the task.
18
 
19
  Attributes:
20
  inputs (Union[Dict[str, str], List[str]]):
@@ -81,7 +81,7 @@ class Task(StreamInstanceOperator):
81
  def check_metrics_type(self) -> None:
82
  prediction_type = parse_type_string(self.prediction_type)
83
  for metric_id in self.metrics:
84
- metric_prediction_type = FormTask.get_metric_prediction_type(metric_id)
85
 
86
  if (
87
  prediction_type == metric_prediction_type
@@ -107,11 +107,13 @@ class Task(StreamInstanceOperator):
107
 
108
  inputs = {key: instance[key] for key in self.inputs.keys()}
109
  outputs = {key: instance[key] for key in self.outputs.keys()}
 
110
 
111
  return {
112
  "inputs": inputs,
113
  "outputs": outputs,
114
  "metrics": self.metrics,
 
115
  }
116
 
117
 
 
3
 
4
  from .artifact import fetch_artifact
5
  from .logging_utils import get_logger
6
+ from .operator import InstanceOperator
7
  from .type_utils import (
8
  get_args,
9
  get_origin,
 
13
  )
14
 
15
 
16
+ class Task(InstanceOperator):
17
+ """Task packs the different instance fields into dictionaries by their roles in the task.
18
 
19
  Attributes:
20
  inputs (Union[Dict[str, str], List[str]]):
 
81
  def check_metrics_type(self) -> None:
82
  prediction_type = parse_type_string(self.prediction_type)
83
  for metric_id in self.metrics:
84
+ metric_prediction_type = Task.get_metric_prediction_type(metric_id)
85
 
86
  if (
87
  prediction_type == metric_prediction_type
 
107
 
108
  inputs = {key: instance[key] for key in self.inputs.keys()}
109
  outputs = {key: instance[key] for key in self.outputs.keys()}
110
+ data_classification_policy = instance.get("data_classification_policy", [])
111
 
112
  return {
113
  "inputs": inputs,
114
  "outputs": outputs,
115
  "metrics": self.metrics,
116
+ "data_classification_policy": data_classification_policy,
117
  }
118
 
119
 
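A small illustration (field names and values invented) of what the change to Task.process means for the produced dict: the policy list is read from the incoming instance, defaults to an empty list, and is carried through next to inputs, outputs and metrics.

# Illustration only; the instance content is made up.
instance = {
    "text": "some input",
    "label": "positive",
    "data_classification_policy": ["public"],
}
# After Task.process(instance), the result would look roughly like:
# {
#     "inputs": {...},
#     "outputs": {...},
#     "metrics": [...],
#     "data_classification_policy": ["public"],
# }
# If the incoming instance has no such key, the field defaults to [].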
templates.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
6
  from .artifact import Artifact
7
  from .collections import ListCollection
8
  from .dataclass import NonPositionalField
9
- from .operator import StreamInstanceOperator
10
  from .random_utils import new_random_generator
11
  from .type_utils import isoftype
12
 
@@ -20,7 +20,7 @@ class TemplateFormatKeyError(KeyError):
20
  )
21
 
22
 
23
- class Template(StreamInstanceOperator):
24
  """The role of template is to take the fields of every instance and verbalize it.
25
 
26
  Meaning the template is taking the instance and generating source, target and references.
 
6
  from .artifact import Artifact
7
  from .collections import ListCollection
8
  from .dataclass import NonPositionalField
9
+ from .operator import InstanceOperator
10
  from .random_utils import new_random_generator
11
  from .type_utils import isoftype
12
 
 
20
  )
21
 
22
 
23
+ class Template(InstanceOperator):
24
  """The role of template is to take the fields of every instance and verbalize it.
25
 
26
  Meaning the template is taking the instance and generating source, target and references.
text_utils.py CHANGED
@@ -89,6 +89,9 @@ def construct_dict_str(d, indent=0, indent_delta=4, max_chars=None):
89
  res += construct_dict_str(value, indent + indent_delta, max_chars=max_chars)
90
  else:
91
  str_value = str(value)
 
 
 
92
  line_width = max_chars - indent
93
  lines = str_value.split("\n")
94
  res += f"{indent_str}{key} ({type(value).__name__}):\n"
 
89
  res += construct_dict_str(value, indent + indent_delta, max_chars=max_chars)
90
  else:
91
  str_value = str(value)
92
+ str_value = re.sub(r"\w+=None, ", "", str_value)
93
+ str_value = re.sub(r"\w+={}, ", "", str_value)
94
+ str_value = re.sub(r"\w+=\[\], ", "", str_value)
95
  line_width = max_chars - indent
96
  lines = str_value.split("\n")
97
  res += f"{indent_str}{key} ({type(value).__name__}):\n"
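The three added substitutions prune fields that are still at their empty defaults (None, {}, []) from stringified values, so pretty-printed artifacts stay compact. A standalone illustration of the same regexes, assuming 're' is already imported at the top of text_utils.py (the hunk does not show the import) and using a made-up repr string:

    import re

    # Hypothetical repr string for illustration only.
    raw = "SomeTemplate(postprocessors=[], instruction=None, target_prefix={}, input_format='{text}', )"
    cleaned = raw
    for pattern in (r"\w+=None, ", r"\w+={}, ", r"\w+=\[\], "):
        cleaned = re.sub(pattern, "", cleaned)
    # cleaned == "SomeTemplate(input_format='{text}', )" - the defaulted fields are gone.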
type_utils.py CHANGED
@@ -13,23 +13,23 @@ def convert_union_type(type_string: str) -> str:
13
 
14
  Args:
15
  type_string (str): A string representation of a Python type hint. It can be any
16
- valid Python type, which does not contain strings (e.g. 'Literal').
17
- Examples include 'List[int|float]', 'str|float|bool' etc.
18
-
19
- Formally, the function depends on the input string adhering to the following rules.
20
- Assuming that the input is a valid type hint the function does not check that 'word' is
21
- 'str', 'bool', 'List' etc. It just depends on the following general structure (spaces ignored):
22
- type -> word OR type( | type)* OR word[type( , type)*]
23
- word is a sequence of (0 or more) chars, each being any char but: [ ] , |
24
- This implies that if any of these 4 chars shows not as a meta char of the input
25
- type_string, but inside some constant string (of Literal, for example), the scheme
26
- will not work.
27
-
28
- Cases like Literal, that might contain occurrences of the four chars above not as meta chars
29
- in the type string, must be handled as special cases by this function, as shown for Literal,
30
- as an example. Because 'format_type_string' serves as preprocessing for 'parse_type_string',
31
- which has a list of allowed types, of which Literal is not a member, Literal and such are not
32
- relevant at all now; and the case is brought here just for an example for future use.
33
 
34
 
35
  Returns:
 
13
 
14
  Args:
15
  type_string (str): A string representation of a Python type hint. It can be any
16
+ valid Python type that does not contain string constants (as in 'Literal', for example).
17
+ Examples include 'List[int|float]', 'str|float|bool', etc.
18
+
19
+ Formally, the function relies on the input string adhering to the following rules.
20
+ Assuming the input is a valid type hint, the function does not check that 'word' is
21
+ 'str', 'bool', 'List', etc. It depends only on the following general structure (spaces ignored):
22
+ type -> word OR type( | type)* OR word[type( , type)*]
23
+ where word is a sequence of (0 or more) chars, each being any char except: [ ] , |
24
+ This implies that if any of these 4 chars appears not as a meta char of the
25
+ type_string, but inside some string constant (of Literal, for example), the scheme
26
+ will not work.
27
+
28
+ Cases like Literal, which may contain occurrences of the four chars above that are not meta
29
+ chars of the type string, must be handled as special cases by this function, as shown for
30
+ Literal as an example. Because 'format_type_string' serves as preprocessing for 'parse_type_string',
31
+ which has a list of allowed types of which Literal is not a member, Literal and the like are not
32
+ relevant for now; the case is included here only as an example for future use.
33
 
34
 
35
  Returns:
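To make the grammar described in the docstring concrete, here is a simplified, self-contained sketch of the conversion it specifies: split on top-level '|' and rewrite unions into Union[...] form. It is not the library's implementation (in particular it does not special-case Literal), and the function name is made up:

    def convert_union_type_sketch(type_string: str) -> str:
        # Simplified illustration of: type -> word OR type(|type)* OR word[type(, type)*]
        def split_top_level(s, sep):
            parts, depth, start = [], 0, 0
            for i, ch in enumerate(s):
                if ch == "[":
                    depth += 1
                elif ch == "]":
                    depth -= 1
                elif ch == sep and depth == 0:
                    parts.append(s[start:i])
                    start = i + 1
            parts.append(s[start:])
            return parts

        def parse(s):
            s = s.strip()
            members = split_top_level(s, "|")
            if len(members) > 1:
                return "Union[" + ", ".join(parse(m) for m in members) + "]"
            if "[" in s and s.endswith("]"):
                word, inner = s.split("[", 1)
                args = split_top_level(inner[:-1], ",")
                return word + "[" + ", ".join(parse(a) for a in args) + "]"
            return s

        return parse(type_string)

    # convert_union_type_sketch("List[int|float]")  -> "List[Union[int, float]]"
    # convert_union_type_sketch("str|float|bool")   -> "Union[str, float, bool]"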
validate.py CHANGED
@@ -4,14 +4,14 @@ from typing import Any, Dict, Optional
4
 
5
  from datasets import Features, Sequence, Value
6
 
7
- from .operator import StreamInstanceOperator
8
 
9
 
10
  class Validator(ABC):
11
  pass
12
 
13
 
14
- class ValidateSchema(Validator, StreamInstanceOperator):
15
  schema: Features = None
16
 
17
  def verify(self):
 
4
 
5
  from datasets import Features, Sequence, Value
6
 
7
+ from .operator import InstanceOperator
8
 
9
 
10
  class Validator(ABC):
11
  pass
12
 
13
 
14
+ class ValidateSchema(Validator, InstanceOperator):
15
  schema: Features = None
16
 
17
  def verify(self):
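For reference, a ValidateSchema instance is configured with a datasets.Features schema, presumably used to check instances. A sketch of what such a schema might look like; the field names here are hypothetical and not mandated by the diff:

    from datasets import Features, Sequence, Value

    # Hypothetical schema - real schemas depend on the pipeline being validated.
    example_schema = Features(
        {
            "source": Value("string"),
            "target": Value("string"),
            "references": Sequence(Value("string")),
            "metrics": Sequence(Value("string")),
        }
    )

    # A ValidateSchema operator could then be configured with it, e.g.:
    # ValidateSchema(schema=example_schema)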
version.py CHANGED
@@ -1 +1 @@
1
- version = "1.9.0"
 
1
+ version = "1.10.0"