Elron committed on
Commit: b462f85
Parent: ff375eb

Upload folder using huggingface_hub

Files changed (24)
  1. artifact.py +4 -3
  2. blocks.py +1 -1
  3. card.py +2 -2
  4. formats.py +2 -2
  5. fusion.py +95 -43
  6. inference.py +78 -19
  7. llm_as_judge.py +111 -31
  8. loaders.py +33 -6
  9. metric_utils.py +103 -44
  10. metrics.py +76 -22
  11. operator.py +6 -4
  12. operators.py +97 -6
  13. processors.py +22 -9
  14. schema.py +14 -6
  15. settings_utils.py +2 -1
  16. splitters.py +6 -3
  17. standard.py +3 -0
  18. stream.py +36 -5
  19. string_operators.py +18 -2
  20. struct_data_operators.py +19 -0
  21. task.py +3 -51
  22. templates.py +142 -11
  23. text_utils.py +40 -0
  24. version.py +1 -1
artifact.py CHANGED
@@ -248,8 +248,9 @@ class Artifact(Dataclass):
             value = map_values_in_place(value, maybe_recover_artifact)
             setattr(self, field.name, value)
 
-        self.prepare()
-        self.verify()
+        if not settings.skip_artifacts_prepare_and_verify:
+            self.prepare()
+            self.verify()
 
     def _to_raw_dict(self):
         return {"type": self.type, **self._init_dict}
@@ -335,7 +336,7 @@ def get_artifactory_name_and_args(
 
 def verbosed_fetch_artifact(identifier):
     artifact, artifactory = fetch_artifact(identifier)
-    logger.info(f"Artifact {identifier} is fetched from {artifactory}")
+    logger.debug(f"Artifact {identifier} is fetched from {artifactory}")
     return artifact
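The new guard lets artifact construction skip prepare()/verify() when the skip_artifacts_prepare_and_verify setting is on. A minimal sketch of how a caller might toggle it; the setting name comes from this diff, while the attribute-style access through get_settings() is an assumption about the settings object:

    from unitxt.settings_utils import get_settings

    settings = get_settings()
    settings.skip_artifacts_prepare_and_verify = True  # assumed attribute-style toggle
    # Artifacts instantiated from here on are built without running prepare()/verify().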
blocks.py CHANGED
@@ -31,7 +31,7 @@ from .struct_data_operators import (
     TruncateTableCells,
     TruncateTableRows,
 )
-from .task import FormTask
+from .task import Task
 from .templates import (
     InputOutputTemplate,
     MultiLabelTemplate,
card.py CHANGED
@@ -6,7 +6,7 @@ from .dataclass import OptionalField
 from .loaders import Loader
 from .operator import StreamingOperator
 from .splitters import RandomSampler, Sampler
-from .task import FormTask
+from .task import Task
 
 
 class TaskCard(Artifact):
@@ -24,6 +24,6 @@ class TaskCard(Artifact):
 
     loader: Loader
     preprocess_steps: List[StreamingOperator] = None
-    task: FormTask
+    task: Task
     templates: Collection = None
     sampler: Sampler = OptionalField(default_factory=RandomSampler)
formats.py CHANGED
@@ -114,9 +114,9 @@ class SystemFormat(Format):
     """
 
     demos_field: str = "demos"
-    demo_format: str = "{source}\n{target_prefix}{target}\n\n"  # example: "User: {source}\nAgent: {target}\n\n"
+    demo_format: str = "{source}\\N{target_prefix}{target}\n\n"  # example: "User: {source}\nAgent: {target}\n\n"
     model_input_format: str = (
-        "{system_prompt}{instruction}{demos}{source}\n{target_prefix}"
+        "{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}"
     )
     format_args: Dict[str, str] = OptionalField(default_factory=dict)
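The change swaps hard newlines for the \N marker in the format strings. Judging from this diff alone, \N appears to act as an "exactly one newline here" separator, so empty pieces (no system prompt, no demos) do not leave blank lines behind. A rough sketch of that collapsing behavior, under that assumption; render() and the collapsing regex are illustrative, not unitxt's actual implementation:

    import re

    def render(template: str, **kwargs) -> str:
        text = template.format(**kwargs)
        # Assumed semantics: each run of "\N" markers and adjacent newlines
        # collapses into a single newline; leading separators vanish.
        text = re.sub(r"\n*(\\N\n*)+", "\n", text)
        return text.lstrip("\n")

    print(render("{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}",
                 system_prompt="", instruction="Answer briefly.", demos="",
                 source="User: 2+2?", target_prefix="Agent: "))
    # -> "Answer briefly.\nUser: 2+2?\nAgent: "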
fusion.py CHANGED
@@ -1,31 +1,44 @@
-import copy
 from abc import abstractmethod
-from typing import Generator, List, Optional
+from typing import Dict, Generator, List, Optional, Union
 
 from .dataclass import NonPositionalField
 from .operator import SourceOperator
 from .random_utils import new_random_generator
-from .stream import MultiStream, Stream
+from .stream import GeneratorStream, MultiStream
+from .type_utils import isoftype
 
 
 class BaseFusion(SourceOperator):
-    """BaseFusion operator that combines multiple streams into one.
+    """BaseFusion operator that combines multiple multistreams into one.
 
     Args:
-        include_splits: List of splits to include. If None, all splits are included.
+        origins: a dict of named SourceOperator objects (each to yield a MultiStream) or a list thereof,
+            each is specified along with its input, so can generate a MultiStream
+        include_splits: List of splits to include from each input MultiStream.
+            If None, all splits are included.
     """
 
-    origins: List[SourceOperator]
+    origins: Union[List[SourceOperator], Dict[str, SourceOperator]]
     include_splits: Optional[List[str]] = NonPositionalField(default=None)
 
     @abstractmethod
     def fusion_generator(self, split) -> Generator:
         pass
 
-    def splits(self) -> Generator:
+    def prepare(self):
+        assert isoftype(self.origins, Dict[str, SourceOperator]) or isoftype(
+            self.origins, List[SourceOperator]
+        )
+        self.named_origins = (
+            {i: self.origins[i]() for i in range(len(self.origins))}
+            if isinstance(self.origins, list)
+            else {name: origin() for name, origin in self.origins.items()}
+        )
+
+    def splits(self) -> List[str]:
         splits = []
-        for origin in self.origins:
-            for s in origin().keys():
+        for _, origin in self.named_origins.items():
+            for s in origin.keys():
                 if s not in splits:
                     if self.include_splits is None or s in self.include_splits:
                         splits.append(s)
@@ -36,48 +49,62 @@ class BaseFusion(SourceOperator):
     ) -> MultiStream:
         result = {}
         for split in self.splits():
-            result[split] = Stream(self.fusion_generator, gen_kwargs={"split": split})
+            result[split] = GeneratorStream(
+                self.fusion_generator, gen_kwargs={"split": split}
+            )
         return MultiStream(result)
 
 
 class FixedFusion(BaseFusion):
-    """FixedFusion operator that combines multiple streams into one based on a fixed number of examples per task.
+    """FixedFusion operator that combines multiple multistreams into one, limiting the number of instances taken from each split of each input multistream.
 
     Args:
-        origins: List of SourceOperator objects.
-        examples_per_task: Number of examples per task. If None, all examples are returned.
-        splits: List of splits to include. If None, all splits are included.
+        origins: Dict of named SourceOperator objects (each to yield a MultiStream), or a list thereof
+        splits: List of splits (stream_names) to include, over all input multistreams. If None, all splits are included.
+        max_instances_per_origin_split: Number of instances to take from each input split of each input multistream.
+            If None, all instances of each split (that is specified in include_splits) are included in the result.
+
     """
 
-    max_instances_per_origin: Optional[int] = None
+    max_instances_per_origin_split: Optional[int] = None
+
+    def prepare(self):
+        super().prepare()
 
+    # flake8: noqa: C901
     def fusion_generator(self, split) -> Generator:
-        for origin in self.origins:
-            multi_stream = origin()
-            if split not in multi_stream:
+        for origin_name, origin in self.named_origins.items():
+            if split not in origin:
                 continue
-            iterator = iter(multi_stream[split])
-            if self.max_instances_per_origin is not None:
-                for _ in range(self.max_instances_per_origin):
-                    try:
-                        yield next(iterator)
-                    except StopIteration:
-                        break
-            else:
-                yield from iterator
+            emitted_from_this_split = 0
+            for instance in origin[split]:
+                if (
+                    self.max_instances_per_origin_split is not None
+                    and emitted_from_this_split >= self.max_instances_per_origin_split
+                ):
+                    break
+                if isinstance(origin_name, str):
+                    # named origins, not anonymous, record in instance
+                    if "group" in instance:
+                        instance["group"] = origin_name + "/" + instance["group"]
+                    else:
+                        instance["group"] = origin_name
+                emitted_from_this_split += 1
+                yield instance
 
 
 class WeightedFusion(BaseFusion):
-    """Fusion operator that combines multiple streams based.
+    """Fusion operator that combines multiple MultiStream-s.
 
     Args:
-        origins: List of SourceOperator objects.
-        weights: List of weights for each origin.
-        max_total_examples: Total number of examples to return. If None, all examples are returned.
+        origins: Dict of named MultiStream objects, or a list thereof
+        weights: Dict of named weights for each origin, or a list thereof
+        max_total_examples: Total number of instances to return per returned split.
+            If None, all instances are returned
     """
 
-    origins: List[SourceOperator] = None
-    weights: List[float] = None
+    origins: Union[Dict[str, MultiStream], List[MultiStream]] = None
+    weights: Union[Dict[str, Union[float, int]], List[Union[int, float]]] = None
     max_total_examples: int = None
 
     def verify(self):
@@ -87,22 +114,47 @@ class WeightedFusion(BaseFusion):
         assert len(self.origins) == len(
             self.weights
         ), "origins and weights must have the same length"
+        assert isoftype(self.origins, Dict[str, SourceOperator]) or isoftype(
+            self.origins, List[SourceOperator]
+        )
+        assert isoftype(self.weights, Dict[str, Union[int, float]]) or isoftype(
+            self.weights, List[Union[int, float]]
+        )
+        assert isinstance(self.origins, dict) == isinstance(self.weights, dict)
+
+    def prepare(self):
+        super().prepare()
+        self.named_weights = (
+            {i: float(self.weights[i]) for i in range(len(self.weights))}
+            if isinstance(self.weights, list)
+            else {k: float(v) for (k, v) in self.weights.items()}
+        )
 
     def fusion_generator(self, split) -> Generator:
-        weights = copy.deepcopy(self.weights)
-        iterators = [iter(origin()[split]) for origin in self.origins]
+        iterators = {
+            named_origin: iter(origin[split])
+            for named_origin, origin in self.named_origins.items()
+        }
         total_examples = 0
        random_generator = new_random_generator(sub_seed="weighted_fusion_" + split)
         while (
-            self.max_total_examples is None or total_examples <= self.max_total_examples
+            self.max_total_examples is None or total_examples < self.max_total_examples
         ) and len(iterators) > 0:
-            iterator = random_generator.choices(population=iterators, weights=weights)[
-                0
-            ]
+            population = list(iterators.keys())
+            origin_name = random_generator.choices(
+                population=population,
+                weights=[self.named_weights[name] for name in population],
+            )[0]
+            iterator = iterators[origin_name]
             try:
-                yield next(iterator)
+                instance = next(iterator)
+                if isinstance(origin_name, str):
+                    if "group" in instance:
+                        instance["group"] = origin_name + "/" + instance["group"]
+                    else:
+                        instance["group"] = origin_name
                 total_examples += 1
+                yield instance
+
             except StopIteration:
-                index = iterators.index(iterator)
-                iterators.pop(index)
-                weights.pop(index)
+                iterators.pop(origin_name)
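With origins now accepted as named dicts, fused instances carry their origin's name in the "group" field, which the metrics pipeline later splits on. A minimal usage sketch; the two recipe objects are hypothetical placeholders for any SourceOperator, and calling the fusion object to obtain its MultiStream mirrors how the code calls each origin():

    from unitxt.fusion import WeightedFusion

    fusion = WeightedFusion(
        origins={"qa": qa_recipe, "summ": summ_recipe},  # hypothetical SourceOperators
        weights={"qa": 3, "summ": 1},  # "qa" is sampled about three times as often
        max_total_examples=1000,
    )
    multi_stream = fusion()
    # Each emitted instance now has instance["group"] == "qa" or "summ" (prefixed
    # onto any pre-existing group value), enabling SplitByNestedGroup downstream.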
inference.py CHANGED
@@ -1,7 +1,7 @@
 import abc
 import os
-from dataclasses import dataclass
-from typing import List, Optional, Union
+from dataclasses import field
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from .artifact import Artifact
 from .operator import PackageRequirementsMixin
@@ -28,28 +28,72 @@ class InferenceEngine(abc.ABC, Artifact):
 class HFPipelineBasedInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     model_name: str
     max_new_tokens: int
+    use_fp16: bool = True
     _requirement = {
         "transformers": "Install huggingface package using 'pip install --upgrade transformers"
     }
 
     def prepare(self):
-        from transformers import pipeline
+        import torch
+        from transformers import AutoConfig, pipeline
 
-        self.model = pipeline(model=self.model_name)
+        model_args: Dict[str, Any] = (
+            {"torch_dtype": torch.float16} if self.use_fp16 else {}
+        )
+        model_args.update({"max_new_tokens": self.max_new_tokens})
+
+        device = torch.device(
+            "mps"
+            if torch.backends.mps.is_available()
+            else 0
+            if torch.cuda.is_available()
+            else "cpu"
+        )
+        # We do this, because in some cases, using device:auto will offload some weights to the cpu
+        # (even though the model might *just* fit to a single gpu), even if there is a gpu available, and this will
+        # cause an error because the data is always on the gpu
+        if torch.cuda.device_count() > 1:
+            assert device == torch.device(0)
+            model_args.update({"device_map": "auto"})
+        else:
+            model_args.update({"device": device})
+
+        task = (
+            "text2text-generation"
+            if AutoConfig.from_pretrained(
+                self.model_name, trust_remote_code=True
+            ).is_encoder_decoder
+            else "text-generation"
+        )
+
+        if task == "text-generation":
+            model_args.update({"return_full_text": False})
+
+        self.model = pipeline(
+            model=self.model_name, trust_remote_code=True, **model_args
+        )
 
     def infer(self, dataset):
-        return [
-            output["generated_text"]
-            for output in self.model(
-                [instance["source"] for instance in dataset],
-                max_new_tokens=self.max_new_tokens,
-            )
-        ]
+        outputs = []
+        for output in self.model([instance["source"] for instance in dataset]):
+            if isinstance(output, list):
+                output = output[0]
+            outputs.append(output["generated_text"])
+        return outputs
 
 
-@dataclass()
-class IbmGenAiInferenceEngineParams:
-    decoding_method: str = None
+class MockInferenceEngine(InferenceEngine):
+    model_name: str
+
+    def prepare(self):
+        return
+
+    def infer(self, dataset):
+        return ["[[10]]" for instance in dataset]
+
+
+class IbmGenAiInferenceEngineParams(Artifact):
+    decoding_method: Optional[Literal["greedy", "sample"]] = None
     max_new_tokens: Optional[int] = None
     min_new_tokens: Optional[int] = None
     random_seed: Optional[int] = None
@@ -64,7 +108,9 @@ class IbmGenAiInferenceEngineParams:
 class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     label: str = "ibm_genai"
     model_name: str
-    parameters: IbmGenAiInferenceEngineParams = IbmGenAiInferenceEngineParams()
+    parameters: IbmGenAiInferenceEngineParams = field(
+        default_factory=IbmGenAiInferenceEngineParams
+    )
     _requirement = {
         "genai": "Install ibm-genai package using 'pip install --upgrade ibm-generative-ai"
     }
@@ -87,7 +133,19 @@ class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     def infer(self, dataset):
         from genai.schema import TextGenerationParameters
 
-        genai_params = TextGenerationParameters(**self.parameters.__dict__)
+        genai_params = TextGenerationParameters(
+            max_new_tokens=self.parameters.max_new_tokens,
+            min_new_tokens=self.parameters.min_new_tokens,
+            random_seed=self.parameters.random_seed,
+            repetition_penalty=self.parameters.repetition_penalty,
+            stop_sequences=self.parameters.stop_sequences,
+            temperature=self.parameters.temperature,
+            top_p=self.parameters.top_p,
+            top_k=self.parameters.top_k,
+            typical_p=self.parameters.typical_p,
+            decoding_method=self.parameters.decoding_method,
+        )
+
         return list(
             self.client.text.generation.create(
                 model_id=self.model_name,
@@ -97,8 +155,7 @@ class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     )
 
 
-@dataclass
-class OpenAiInferenceEngineParams:
+class OpenAiInferenceEngineParams(Artifact):
     frequency_penalty: Optional[float] = None
     presence_penalty: Optional[float] = None
     max_tokens: Optional[int] = None
@@ -111,7 +168,9 @@ class OpenAiInferenceEngineParams:
 class OpenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     label: str = "openai"
     model_name: str
-    parameters: OpenAiInferenceEngineParams = OpenAiInferenceEngineParams()
+    parameters: OpenAiInferenceEngineParams = field(
+        default_factory=OpenAiInferenceEngineParams
+    )
     _requirement = {
         "openai": "Install openai package using 'pip install --upgrade openai"
     }
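A short usage sketch of the reworked HF engine; the model choice is arbitrary, and whether prepare() must be called explicitly depends on the Artifact lifecycle changed in artifact.py above:

    from unitxt.inference import HFPipelineBasedInferenceEngine

    engine = HFPipelineBasedInferenceEngine(
        model_name="google/flan-t5-small",  # encoder-decoder -> "text2text-generation"
        max_new_tokens=32,
        use_fp16=True,  # new flag: load weights as torch.float16
    )
    predictions = engine.infer([{"source": "Translate to French: Hello"}])

The device logic prefers MPS, then a single CUDA device, then CPU, and only switches to device_map="auto" when more than one GPU is visible, to avoid partial CPU offloading of a model that would fit on one GPU.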
llm_as_judge.py CHANGED
@@ -1,58 +1,138 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Literal, Optional
 
-import evaluate
-
-from .api import produce
-from .inference import InferenceEngine
+from .api import evaluate, produce
+from .inference import InferenceEngine, OpenAiInferenceEngine
 from .metrics import BulkInstanceMetric
+from .operator import SequentialOperator
 
 
 class LLMAsJudge(BulkInstanceMetric):
     """LLM as judge based metric class for evaluating correctness.
 
     Attributes:
-        main_score (str): The main score used for evaluation.
+        main_score (str): The main score label used for evaluation.
+        task (Literal["rating.single_turn"]): The type of task the llm-as-judge runs. This defines the output and input
+            format of the judge model.
+        template (str): The template used when generating inputs for the judge llm.
+        format (str): The format used when generating inputs for the judge llm.
+        system_prompt (str): The system prompt used when generating inputs for the judge llm.
+        strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the
+            inputs that the model being judged received, when they are inserted into the llm-as-judge prompt.
+        inference_model (InferenceEngine): the module that creates the inference of the judge llm.
         reduction_map (dict): A dictionary specifying the reduction method for the metric.
-        betch_size (int): The size of the bulk.
-        recipe (str): The unitxt recipe that will be used to create the judge dataset.
-        inference (InferenceEngine): the module that creates the inference.
-
-    Methods:
-        prepare(self): Initialization method for the metric.
-        compute(self, references, predictions, additional_inputs): Method to compute the metric.
-
-    Usage:
-        metric = LlamaIndexCorrectnessMetric()
-        scores = metric.compute(references, prediction, additional_inputs)
+        batch_size (int): The size of the bulk.
     """
 
     main_score: str = "llm_as_judge"
-    reduction_map: Dict[str, List[str]] = None
-    batch_size: int = 32
-    recipe: str
+    task: Literal["rating.single_turn", "rating.single_turn_with_reference"]
+    template: str
+    format: Optional[str] = None
+    system_prompt: Optional[str] = None
+    strip_system_prompt_and_format_from_inputs: bool = True
     inference_model: InferenceEngine
+    reduction_map: Optional[Dict[str, List[str]]] = None
+    batch_size: int = 32
+
+    def _get_input_instances(self, task_data: List[Dict]) -> List:
+        if self.strip_system_prompt_and_format_from_inputs:
+            instances = []
+            for task_data_instance in task_data:
+                template = task_data_instance["metadata"]["template"]
+                instance = SequentialOperator(
+                    steps=[template, "formats.empty"]
+                ).process_instance(
+                    {"inputs": task_data_instance, "outputs": task_data_instance}
+                )
+                instances.append(instance["source"])
+                """
+                We also have access to: instance["target"]
+                                        instance["references"]
+                """
+            return instances
+        return [t["source"] for t in task_data]
+
+    def _get_instance_for_judge_model(
+        self, input_instances: List[str], predictions: List, references: List
+    ) -> List[Dict]:
+        if self.task == "rating.single_turn":
+            instances = [
+                {
+                    "question": input_instance,
+                    "answer": prediction,
+                    "rating": 5.0,  # This is a dummy value that is not used in practice
+                }
+                for input_instance, prediction, reference in zip(
+                    input_instances, predictions, references
+                )
+            ]
+        elif self.task == "rating.single_turn_with_reference":
+            instances = [
+                {
+                    "question": input_instance,
+                    "answer": prediction,
+                    "reference_answer": reference,
+                    "rating": 5.0,  # This is a dummy value that is not used in practice
+                }
+                for input_instance, prediction, reference in zip(
+                    input_instances, predictions, references
+                )
+            ]
+        else:
+            raise NotImplementedError(
+                f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type."
+            )
+        return instances
 
     def prepare(self):
         super().prepare()
         if self.reduction_map is None:
             self.reduction_map = {"mean": [self.main_score]}
 
+        supported_tasks = ["rating.single_turn", "rating.single_turn_with_reference"]
+        assert self.task in supported_tasks, (
+            f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type."
+            f" The supported task types are: {', '.join(supported_tasks)}."
+        )
+
+        if isinstance(self.inference_model, OpenAiInferenceEngine):
+            if self.format:
+                raise ValueError(
+                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
+                    "not support formatting. Please remove the format definition from the recipe"
+                    " (the OpenAi Chat API takes care of the formatting automatically)."
+                )
+            if self.system_prompt:
+                raise ValueError(
+                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
+                    "not support system prompt. Please remove the system_prompt definition from the recipe"
+                    " (The current implementation of Unitxt does not support this."
+                    " Support will be added in future updates)."
+                )
+
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
         task_data: List[Dict],
     ) -> List[Dict[str, Any]]:
-        instances = [
-            {
-                **task_data_instance,
-                **{"model_output": prediction, "rating_label": "[[5]]"},
-            }
-            for task_data_instance, prediction in zip(task_data, predictions)
-        ]
-
-        dataset = produce(instances, self.recipe)
+        input_instances = self._get_input_instances(task_data)
+        instances = self._get_instance_for_judge_model(
+            input_instances, predictions, references
+        )
+
+        card = f"cards.dynamic_cards_for_llm_judges.{self.task}"
+        recipe = (
+            f"card={card},"
+            f"template={self.template},"
+            "demos_pool_size=0,"
+            "num_demos=0"
+        )
+        if self.system_prompt:
+            recipe = f"{recipe},system_prompt={self.system_prompt}"
+        if self.format:
+            recipe = f"{recipe},format={self.format}"
+
+        dataset = produce(instances, recipe)
         verdicts = self.inference_model.infer(dataset)
-        meta_metric = evaluate.load("unitxt/metric")
-        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
+        meta_scores = evaluate(predictions=verdicts, data=dataset)
         return [{self.main_score: instance["prediction"]} for instance in meta_scores]
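Putting the pieces together, a judge metric is now declared from a task type, a template, and an inference engine, rather than from a free-form recipe string. A sketch; the template name is hypothetical, and any judge template registered in the catalog would do:

    from unitxt.llm_as_judge import LLMAsJudge

    judge = LLMAsJudge(
        task="rating.single_turn",
        template="templates.response_assessment.rating.generic_single_turn",  # hypothetical
        inference_model=engine,  # e.g. the HF engine sketched above
    )

Internally this builds the recipe string "card=cards.dynamic_cards_for_llm_judges.rating.single_turn,template=...,demos_pool_size=0,num_demos=0", produces the judge dataset with produce(), and scores the verdicts with evaluate().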
loaders.py CHANGED
@@ -27,7 +27,7 @@ import os
 import tempfile
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, List, Mapping, Optional, Sequence, Union
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
 import pandas as pd
 from datasets import load_dataset as hf_load_dataset
@@ -38,7 +38,7 @@ from .fusion import FixedFusion
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .settings_utils import get_settings
-from .stream import MultiStream, Stream
+from .stream import GeneratorStream, MultiStream
 
 logger = get_logger()
 settings = get_settings()
@@ -180,7 +180,7 @@ class LoadHF(Loader):
         self.log_limited_loading()
         return MultiStream(
             {
-                name: Stream(
+                name: GeneratorStream(
                     generator=self.split_limited_load, gen_kwargs={"split_name": name}
                 )
                 for name in self._cache.keys()
@@ -240,14 +240,18 @@ class LoadCSV(Loader):
         if self.streaming:
             return MultiStream(
                 {
-                    name: Stream(generator=self.stream_csv, gen_kwargs={"file": file})
+                    name: GeneratorStream(
+                        generator=self.stream_csv, gen_kwargs={"file": file}
+                    )
                     for name, file in self.files.items()
                 }
             )
 
         return MultiStream(
             {
-                name: Stream(generator=self.load_csv, gen_kwargs={"file": file})
+                name: GeneratorStream(
+                    generator=self.load_csv, gen_kwargs={"file": file}
+                )
                 for name, file in self.files.items()
             }
         )
@@ -472,5 +476,28 @@ class MultipleSourceLoader(Loader):
 
     def process(self):
         return FixedFusion(
-            origins=self.sources, max_instances_per_origin=self.get_limit()
+            origins=self.sources, max_instances_per_origin_split=self.get_limit()
         ).process()
+
+
+class LoadFromDictionary(Loader):
+    """Allows loading data from a dictionary of constants.
+
+    The loader can be used, for example, when debugging or working with small datasets.
+
+    Attributes:
+        data (Dict[str, List[Dict[str, Any]]]): a dictionary of constants from which the data will be loaded
+
+    Examples:
+        data = {
+            "train": [{"input": "SomeInput1", "output": "SomeResult1"}],
+            "test": [{"input": "SomeInput2", "output": "SomeResult2"}],
+        }
+        loader = LoadFromDictionary(data=data)
+        multi_stream = loader.process()
+    """
+
+    data: Dict[str, List[Dict[str, Any]]]
+
+    def process(self) -> MultiStream:
+        return MultiStream.from_iterables(self.data)
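The docstring example, end to end; each split maps to a list of instance dicts, matching the declared Dict[str, List[Dict[str, Any]]] type:

    loader = LoadFromDictionary(
        data={
            "train": [{"input": "SomeInput1", "output": "SomeResult1"}],
            "test": [{"input": "SomeInput2", "output": "SomeResult2"}],
        }
    )
    multi_stream = loader.process()
    for instance in multi_stream["train"]:
        print(instance["input"])  # -> SomeInput1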
metric_utils.py CHANGED
@@ -1,71 +1,125 @@
 import json
-from typing import Any, Dict, Iterable, List, Optional
+from copy import deepcopy
+from typing import Any, Dict, Generator, Iterable, List, Optional
 
 from datasets import Features, Value
+from numpy import nanmean
 
 from .dataclass import Dataclass
+from .dict_utils import dict_set
 from .operator import (
     MultiStreamOperator,
     SequentialOperatorInitializer,
     StreamInitializerOperator,
 )
 from .operators import (
-    Apply,
     ApplyMetric,
     ApplyOperatorsField,
+    CopyFields,
     FlattenInstances,
     MergeStreams,
-    SplitByValue,
+    SplitByNestedGroup,
 )
 from .register import _reset_env_local_catalogs, register_all_artifacts
 from .schema import UNITXT_DATASET_SCHEMA
 from .settings_utils import get_settings
-from .stream import MultiStream, Stream
+from .stream import GeneratorStream, MultiStream
+from .struct_data_operators import LoadJson
 
 
 class MultiStreamScoreMean(MultiStreamOperator):
-    def aggregate_results(self, multi_stream: MultiStream):
-        scores = []
-        for stream in multi_stream.values():
-            instance = stream.peek()
-            scores.append(instance["score"]["global"]["score"])
-
-        from statistics import mean
-
-        return mean(scores)
-
-    def spread_results(self, stream: Stream, score: float):
-        for instance in stream:
-            instance["score"]["global"]["groups_mean_score"] = score
-            yield instance
+    """Given a multi-stream where each stream is already scored globally, generate a nested global score for the whole multi-stream.
+
+    The whole-ms-global-score is a nested structure, specifying (also) the individual global scores of the
+    individual streams participating in the input multi_stream.
+    The instances of all these individual streams are assumed to have the "group" field indicate the stream
+    they belong to.
+    Potentially, these individual streams were produced by a SplitByNestedGroup
+    operator that did not use the full length of the value in field "group" of the instances, but only the
+    first g components thereof, indicated by argument 'number_of_fusion_generations' of operator SplitByNestedGroup.
+    At any rate, a distinguishing prefix of the "group" value is recorded, by operator SplitByNestedGroup, in the stream_name.
+    The nested structure of the whole-ms-global-score is induced by these distinguishing prefixes:
+    the global score of each individual stream sits deep in the nested whole-ms-global-score,
+    at the leaf reached by the path given by the distinguishing prefix indicated in the stream_name.
+    Thus, the global score of the stream becomes a leaf (though a dict by itself) of the whole-ms-global-score.
+
+    The ancestor nodes of the above leaves, in the whole-ms-global-score, each contain (in addition to dicts
+    leading down to leaves) a field named "score" whose value is set to be the mean of the values
+    sitting in field "score" of its immediate children nodes, and a field named "score_name" whose
+    value is set to be "groups_mean".
+
+    When the input multistream consists of one single stream, it is returned as is, mainly for backward compatibility.
+    """
 
-    def spread_results_one_stream(self, stream: Stream):
-        for instance in stream:
-            instance["score"]["global"]["groups_mean_score"] = instance["score"][
-                "global"
-            ]["score"]
-            yield instance
+    def update_intermediate_level_scores(self, level: dict) -> float:
+        if "score" in level:
+            # this is the global score of a stream participating in this MultiStream
+            return level["score"]
+        sub_scores = []
+        for key in level:
+            if isinstance(level[key], dict):
+                sub_scores.append(self.update_intermediate_level_scores(level[key]))
+        level.update({"score": nanmean(sub_scores), "score_name": "groups_mean"})
+        return level["score"]
 
     def process(self, multi_stream: MultiStream) -> MultiStream:
-        result = {}
-
-        # optimization to avoid double calculation of metrics
-        # when aggregating results, if there is only one stream.
+        # each stream went through Metric, which is a single-stream-operator, and ended up with all
+        # its instance["score"]["global"] linking to the same single dict object.
+        # Here we first generate a new, nested version for the whole-ms-global_score, and then update
+        # each stream's global score with the new version.
+        # But if there is only one stream in the multistream - we return it as is
         if len(multi_stream) == 1:
-            for stream_name, stream in multi_stream.items():
-                result[stream_name] = Stream(
-                    self.spread_results_one_stream, gen_kwargs={"stream": stream}
-                )
-            return MultiStream(result)
+            return multi_stream
+        global_score = {}
+        first_instances = {}
+        iterators = {}
 
-        mean_score = self.aggregate_results(multi_stream)
-        result = {}
         for stream_name, stream in multi_stream.items():
-            result[stream_name] = Stream(
-                self.spread_results, gen_kwargs={"stream": stream, "score": mean_score}
+            iterators[stream_name] = iter(stream)
+            try:
+                first_instances[stream_name] = next(iterators[stream_name])
+            except StopIteration:
+                continue  # an empty stream, goto next stream
+            instance = first_instances[stream_name]
+            dict_set(
+                dic=global_score,
+                query=stream_name.split("~")[-1],
+                value=deepcopy(instance["score"]["global"]),
+                not_exist_ok=True,
             )
 
-        return MultiStream(result)
+        self.update_intermediate_level_scores(global_score)
+        # update the global_score object for each stream. Recall that all instances
+        # in each stream all link to the same python dict object
+        for stream_name in multi_stream.keys():
+            instance = first_instances[stream_name]
+            instance["score"]["global"].clear()
+            instance["score"]["global"].update(global_score)
+
+        def never_peek_twice_generator(
+            stream_name: str, first_instances: dict, iterators: dict
+        ) -> Generator:
+            while True:
+                if stream_name in first_instances:
+                    yield first_instances.pop(stream_name)
+                try:
+                    yield next(iterators[stream_name])
+                except StopIteration:
+                    return
+
+        return MultiStream(
+            {
+                stream_name: GeneratorStream(
+                    never_peek_twice_generator,
+                    gen_kwargs={
+                        "stream_name": stream_name,
+                        "first_instances": first_instances,
+                        "iterators": iterators,
+                    },
+                )
+                for stream_name in multi_stream.keys()
+            }
+        )
 
 
 class FromPredictionsAndOriginalData(StreamInitializerOperator):
@@ -78,7 +132,7 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
     ) -> MultiStream:
         return MultiStream(
             {
-                split_name: Stream(
+                split_name: GeneratorStream(
                     self.zip,
                     gen_kwargs={"predictions": predictions, "references": references},
                 )
@@ -94,20 +148,25 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
 
 class MetricRecipe(SequentialOperatorInitializer):
     calc_confidence_intervals: bool = True
+    number_of_fusion_generations: int = 2
 
     def prepare(self):
         register_all_artifacts()
         self.steps = [
             FromPredictionsAndOriginalData(),
-            Apply(
-                "task_data",
-                function="json.loads",
-                to_field="task_data",
+            LoadJson(field="task_data"),
+            CopyFields(
+                field_to_field={
+                    "source": "task_data/source",
+                }
             ),
             ApplyOperatorsField(
                 operators_field="postprocessors",
             ),
-            SplitByValue(["group"]),
+            SplitByNestedGroup(
+                field_name_of_group="group",
+                number_of_fusion_generations=self.number_of_fusion_generations,
+            ),
             ApplyMetric(
                 "metrics",
                 calc_confidence_intervals=self.calc_confidence_intervals,
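To make the nested whole-ms-global-score concrete: two streams named "test~wnli/nli" and "test~stsb" (the part after "~" is the distinguishing group prefix) would produce a structure along these lines, assuming dict_set treats "/" in the query as a nesting separator (inferred from this diff, not verified against dict_utils; the score values are made up):

    global_score = {
        "wnli": {
            "nli": {"score": 0.70, "score_name": "accuracy"},  # leaf: that stream's global score
            "score": 0.70,             # mean over immediate children
            "score_name": "groups_mean",
        },
        "stsb": {"score": 0.60, "score_name": "spearman"},
        "score": 0.65,                 # nanmean(0.70, 0.60)
        "score_name": "groups_mean",
    }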
metrics.py CHANGED
@@ -3,7 +3,7 @@ import string
 import uuid
 import warnings
 from abc import ABC, abstractmethod
-from collections import Counter
+from collections import Counter, defaultdict
 from copy import deepcopy
 from dataclasses import field
 from statistics import mean
@@ -915,11 +915,15 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
                         if uses_subgroups
                         else score_dict[default_subgroup_name]
                     )
-                    for score_name, score_dict in group_scores.items()
+                    for score_name, score_dict in group_to_instance_scores[
+                        group_name
+                    ].items()
                 }
            }
         }
-        for group_scores in group_to_instance_scores.values()
+        for group_name in sorted(
+            group_to_instance_scores.keys()
+        )  # sorted for consistency
     ]
 
     def _set_up_group_mean_aggregation(
@@ -977,6 +981,40 @@ class Accuracy(InstanceMetric):
         return result
 
 
+class JaccardIndex(InstanceMetric):
+    reduction_map = {"mean": ["jaccard_index"]}
+    main_score = "jaccard_index"
+    ci_scores = ["jaccard_index"]
+
+    prediction_type = "Any"  # string representation is compared
+
+    def compute(
+        self, references: List[Any], prediction: Any, task_data: List[Dict]
+    ) -> dict:
+        if not isinstance(prediction, set):
+            prediction = set(prediction)
+        references = [set(reference) for reference in references]
+
+        result = {
+            self.main_score: max(
+                [
+                    float(
+                        (len(reference.intersection(prediction)))
+                        / (
+                            len(reference)
+                            + len(prediction)
+                            - len(reference.intersection(prediction))
+                        )
+                    )
+                    for reference in references
+                ]
+            )
+        }
+        result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
+        return result
+
+
 class MaxAccuracy(Accuracy):
     """Calculate the maximal accuracy over all instances as the global score."""
 
@@ -1274,10 +1312,13 @@ class F1Binary(GlobalMetric):
     _metric = None
     metric = "f1"
     single_reference_per_prediction = True
+    _requirements_list: List[str] = ["sklearn"]
 
     def prepare(self):
         super().prepare()
-        self._metric = evaluate.load(self.metric)
+        from sklearn import metrics
+
+        self._metric = metrics.precision_recall_fscore_support
 
     def _validate_reference(self, reference):
         super()._validate_reference(reference)
@@ -1294,19 +1335,27 @@ class F1Binary(GlobalMetric):
     ) -> dict:
         flattened_int_references = [int(r[0]) for r in references]
         int_predictions = [int(p > self.threshold) for p in predictions]
-
-        result = self._metric.compute(
-            references=flattened_int_references,
-            predictions=int_predictions,
+        precision, recall, f1, _ = self._metric(
+            y_true=flattened_int_references,
+            y_pred=int_predictions,
             labels=[0, 1],
             average=self.average,
         )
-        if isinstance(result[self.metric], numpy.ndarray):
+        if self.average is None:
             return {
-                self.main_score: result[self.metric][1],
-                f"{self.main_score}_neg": result[self.metric][0],
+                "f1_binary": f1[1],
+                "f1_binary_neg": f1[0],
+                "recall_binary": recall[1],
+                "recall_binary_neg": recall[0],
+                "precision_binary": precision[1],
+                "precision_binary_neg": precision[0],
             }
-        return {self.main_score: result[self.metric]}
+        return {"f1_binary": f1, "recall_binary": recall, "precision_binary": precision}
+
+
+class F1BinaryPosOnly(F1Binary):
+    average = "binary"
+    main_score = "f1_binary"
 
 
 class RecallBinary(F1Binary):
@@ -3358,6 +3407,7 @@ class BinaryMaxF1(F1Binary):
 
     main_score = "max_f1_binary"
     single_reference_per_prediction = True
+    average = None
 
     def compute(
         self,
@@ -3366,9 +3416,9 @@ class BinaryMaxF1(F1Binary):
         task_data: List[Dict],
     ) -> dict:
         best_thr = -1
-        best_f1 = -1
+        best_f1 = defaultdict(lambda: -1)
         best_thr_neg = -1
-        best_f1_neg = -1
+        best_f1_neg = defaultdict(lambda: -1)
         thrs = {round(fp, 3) for fp in predictions}
         for thr in thrs:
             new_predictions = [
@@ -3377,21 +3427,25 @@ class BinaryMaxF1(F1Binary):
             ]
             f1_results = super().compute(references, new_predictions, task_data)
 
-            f1 = f1_results[self.main_score]
-            if f1 > best_f1:
-                best_f1 = f1
+            f1 = f1_results["f1_binary"]
+            if f1 > best_f1["f1_binary"]:
+                best_f1 = f1_results.copy()
                 best_thr = thr
 
-            f1_neg = f1_results[f"{self.main_score}_neg"]
-            if f1_neg > best_f1_neg:
-                best_f1_neg = f1_neg
+            f1_neg = f1_results["f1_binary_neg"]
+            if f1_neg > best_f1_neg["f1_binary_neg"]:
+                best_f1_neg = f1_results.copy()
                 best_thr_neg = thr
 
         return {
-            self.main_score: best_f1,
+            self.main_score: best_f1["f1_binary"],
             "best_thr_maxf1": best_thr,
-            f"{self.main_score}_neg": best_f1_neg,
+            f"{self.main_score}_neg": best_f1_neg["f1_binary_neg"],
             "best_thr_maxf1_neg": best_thr_neg,
+            "recall_at_max_f1": best_f1["recall_binary"],
+            "recall_at_max_f1_neg": best_f1_neg["recall_binary_neg"],
+            "precision_at_max_f1": best_f1["precision_binary"],
+            "precision_at_max_f1_neg": best_f1_neg["precision_binary_neg"],
        }
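For reference, the new JaccardIndex reduces to the familiar |A ∩ B| / |A ∪ B|, maximized over the references. A worked instance:

    prediction = {"a", "b", "c"}
    reference = {"b", "c", "d"}
    score = len(prediction & reference) / len(prediction | reference)  # 2 / 4 = 0.5

which matches the len(reference) + len(prediction) - len(intersection) denominator used in compute().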
operator.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 from .artifact import Artifact
 from .dataclass import InternalField, NonPositionalField
-from .stream import MultiStream, Stream
+from .stream import GeneratorStream, MultiStream, Stream
 from .utils import is_module_available
 
 
@@ -171,7 +171,9 @@ def instance_generator(instance):
 
 
 def stream_single(instance: Dict[str, Any]) -> Stream:
-    return Stream(generator=instance_generator, gen_kwargs={"instance": instance})
+    return GeneratorStream(
+        generator=instance_generator, gen_kwargs={"instance": instance}
+    )
 
 
 class MultiStreamOperator(StreamingOperator):
@@ -244,7 +246,7 @@ class SingleStreamOperator(MultiStreamOperator):
     def _process_single_stream(
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Stream:
-        return Stream(
+        return GeneratorStream(
             self._process_stream,
             gen_kwargs={"stream": stream, "stream_name": stream_name},
         )
@@ -445,7 +447,7 @@ class InstanceOperatorWithMultiStreamAccess(StreamingOperator):
         result = {}
 
         for stream_name, stream in multi_stream.items():
-            stream = Stream(
+            stream = GeneratorStream(
                 self.generator,
                 gen_kwargs={"stream": stream, "multi_stream": multi_stream},
             )
operators.py CHANGED
@@ -37,7 +37,7 @@ import operator
37
  import uuid
38
  import zipfile
39
  from abc import abstractmethod
40
- from collections import Counter
41
  from copy import deepcopy
42
  from dataclasses import field
43
  from itertools import zip_longest
@@ -75,7 +75,7 @@ from .operator import (
75
  )
76
  from .random_utils import new_random_generator
77
  from .settings_utils import get_settings
78
- from .stream import Stream
79
  from .text_utils import nested_tuple_to_string
80
  from .type_utils import isoftype
81
  from .utils import flatten_dict
@@ -490,7 +490,7 @@ class Augmentor(StreamInstanceOperator):
490
 
491
  Args:
492
  augment_model_input: Whether to augment the input to the model.
493
- augment_task_input: Whether to augment the task input fields. The specific fields are defined in the FormTask operator.
494
 
495
  """
496
 
@@ -525,7 +525,7 @@ class Augmentor(StreamInstanceOperator):
525
  if self.augment_task_input:
526
  assert (
527
  len(self._task_input_fields) > 0
528
- ), "No augmentable input fields were defined in FormTask, and augmentation was requested. Specify the fields to augment in 'argumentable_inputs' attribute of the FormTask."
529
  fields = self._task_input_fields
530
  assert not self.augment_model_input
531
 
@@ -860,6 +860,51 @@ class ZipFieldValues(StreamInstanceOperator):
860
  return instance
861
 
862
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  class IndexOf(StreamInstanceOperator):
864
  """For a given instance, finds the offset of value of field 'index_of', within the value of field 'search_in'."""
865
 
@@ -1560,6 +1605,52 @@ class SplitByValue(MultiStreamOperator):
1560
  return MultiStream(result)
1561
 
1562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1563
  class ApplyStreamOperatorsField(SingleStreamOperator, ArtifactFetcherMixin):
1564
  """Applies stream operators to a stream based on specified fields in each instance.
1565
 
@@ -1668,7 +1759,7 @@ class MergeStreams(MultiStreamOperator):
1668
  add_origin_stream_name: bool = True
1669
  origin_stream_name_field_name: str = "origin"
1670
 
1671
- def merge(self, multi_stream):
1672
  for stream_name, stream in multi_stream.items():
1673
  if self.streams_to_merge is None or stream_name in self.streams_to_merge:
1674
  for instance in stream:
@@ -1679,7 +1770,7 @@ class MergeStreams(MultiStreamOperator):
1679
  def process(self, multi_stream: MultiStream) -> MultiStream:
1680
  return MultiStream(
1681
  {
1682
- self.new_stream_name: Stream(
1683
  self.merge, gen_kwargs={"multi_stream": multi_stream}
1684
  )
1685
  }
 
37
  import uuid
38
  import zipfile
39
  from abc import abstractmethod
40
+ from collections import Counter, defaultdict
41
  from copy import deepcopy
42
  from dataclasses import field
43
  from itertools import zip_longest
 
75
  )
76
  from .random_utils import new_random_generator
77
  from .settings_utils import get_settings
78
+ from .stream import GeneratorStream, Stream
79
  from .text_utils import nested_tuple_to_string
80
  from .type_utils import isoftype
81
  from .utils import flatten_dict
 
490
 
491
  Args:
492
  augment_model_input: Whether to augment the input to the model.
493
+ augment_task_input: Whether to augment the task input fields. The specific fields are defined in the Task operator.
494
 
495
  """
496
 
 
525
  if self.augment_task_input:
526
  assert (
527
  len(self._task_input_fields) > 0
528
+ ), "No augmentable input fields were defined in Task, and augmentation was requested. Specify the fields to augment in 'argumentable_inputs' attribute of the Task."
529
  fields = self._task_input_fields
530
  assert not self.augment_model_input
531
 
 
860
  return instance
861
 
862
 
863
+ class InterleaveListsToDialogOperator(StreamInstanceOperator):
864
+ """Interleaves two lists, one of user dialog turns and one of assistant dialog turns, into a single list of tuples, alternating between "user" and "assistant".
865
+
866
+ The list of tuples if of format (role, turn_content), where the role label is specified by
867
+ the 'user_role_label' and 'assistant_role_label' fields (default to "user" and "assistant").
868
+
869
+ The user turns and assistant turns field are specified in the arguments.
870
+ The value of each of the 'fields' is assumed to be a list.
871
+
872
+ """
873
+
874
+ user_turns_field: str
875
+ assistant_turns_field: str
876
+ user_role_label: str = "user"
877
+ assistant_role_label: str = "assistant"
878
+ to_field: str
879
+
880
+ def process(
881
+ self, instance: Dict[str, Any], stream_name: Optional[str] = None
882
+ ) -> Dict[str, Any]:
883
+ user_turns = instance[self.user_turns_field]
884
+ assistant_turns = instance[self.assistant_turns_field]
885
+
886
+ assert (
887
+ len(user_turns) == len(assistant_turns)
888
+ or (len(user_turns) - len(assistant_turns) == 1)
889
+ ), "user_turns must have either the same length as assistant_turns or one more turn."
890
+
891
+ interleaved_dialog = []
892
+ i, j = 0, 0 # Indices for the user and assistant lists
893
+ # While either list has elements left, continue interleaving
894
+ while i < len(user_turns) or j < len(assistant_turns):
895
+ if i < len(user_turns):
896
+ interleaved_dialog.append((self.user_role_label, user_turns[i]))
897
+ i += 1
898
+ if j < len(assistant_turns):
899
+ interleaved_dialog.append(
900
+ (self.assistant_role_label, assistant_turns[j])
901
+ )
902
+ j += 1
903
+
904
+ instance[self.to_field] = interleaved_dialog
905
+ return instance
906
+
907
+
908
  class IndexOf(StreamInstanceOperator):
909
  """For a given instance, finds the offset of value of field 'index_of', within the value of field 'search_in'."""
910
 
 
1605
  return MultiStream(result)
1606
 
1607
 
1608
+ class SplitByNestedGroup(MultiStreamOperator):
1609
+ """Splits a MultiStream that is small - for metrics, hence: whole stream can sit in memory, split by the value of field 'group'.
1610
+
1611
+ Args:
1612
+ number_of_fusion_generations: int
1613
+
1614
+ the value in field group is of the form "sourcen/sourcenminus1/..." describing the sources in which the instance sat
1615
+ when these were fused, potentially several phases of fusion. the name of the most recent source sits first in this value.
1616
+ (See BaseFusion and its extensions)
1617
+ number_of_fuaion_generations specifies the length of the prefix by which to split the stream.
1618
+ E.g. for number_of_fusion_generations = 1, only the most recent fusion in creating this multi_stream, affects the splitting.
1619
+ For number_of_fusion_generations = -1, take the whole history written in this field, ignoring number of generations.
1620
+ """
1621
+
1622
+ field_name_of_group: str = "group"
1623
+ number_of_fusion_generations: int = 1
1624
+
1625
+ def process(self, multi_stream: MultiStream) -> MultiStream:
1626
+ result = defaultdict(list)
1627
+
1628
+ for stream_name, stream in multi_stream.items():
1629
+ for instance in stream:
1630
+ if self.field_name_of_group not in instance:
1631
+ raise ValueError(
1632
+ f"Field {self.field_name_of_group} is missing from instance {instance}"
1633
+ )
1634
+ signature = (
1635
+ stream_name
1636
+ + "~" # a sign that does not show within group values
1637
+ + (
1638
+ "/".join(
1639
+ instance[self.field_name_of_group].split("/")[
1640
+ : self.number_of_fusion_generations
1641
+ ]
1642
+ )
1643
+ if self.number_of_fusion_generations >= 0
1644
+ # for values with a smaller number of generations - take up to their last generation
1645
+ else instance[self.field_name_of_group]
1646
+ # for each instance - take all its generations
1647
+ )
1648
+ )
1649
+ result[signature].append(instance)
1650
+
1651
+ return MultiStream.from_iterables(result)
1652
+
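A minimal sketch of the signature computed per instance, assuming a hypothetical stream name and group value:

```python
# The prefix of the group value (most recent fusion first) is joined to the stream name.
stream_name = "test"
group_value = "recent_fusion/older_fusion/oldest_fusion"
number_of_fusion_generations = 1

signature = stream_name + "~" + "/".join(
    group_value.split("/")[:number_of_fusion_generations]
)
assert signature == "test~recent_fusion"
```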
1653
+
1654
  class ApplyStreamOperatorsField(SingleStreamOperator, ArtifactFetcherMixin):
1655
  """Applies stream operators to a stream based on specified fields in each instance.
1656
 
 
1759
  add_origin_stream_name: bool = True
1760
  origin_stream_name_field_name: str = "origin"
1761
 
1762
+ def merge(self, multi_stream) -> Generator:
1763
  for stream_name, stream in multi_stream.items():
1764
  if self.streams_to_merge is None or stream_name in self.streams_to_merge:
1765
  for instance in stream:
 
1770
  def process(self, multi_stream: MultiStream) -> MultiStream:
1771
  return MultiStream(
1772
  {
1773
+ self.new_stream_name: GeneratorStream(
1774
  self.merge, gen_kwargs={"multi_stream": multi_stream}
1775
  )
1776
  }
processors.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
  import re
3
  from difflib import get_close_matches
@@ -54,14 +55,6 @@ class ExtractWithRegex(RegexParser):
54
  return ""
55
 
56
 
57
- class LoadJson(FieldOperator):
58
- def process_value(self, text: Any) -> Any:
59
- try:
60
- return json.loads(text)
61
- except json.JSONDecodeError:
62
- return []
63
-
64
-
65
  class ListToEmptyEntitiesTuples(FieldOperator):
66
  def process_value(self, lst: Any) -> Any:
67
  try:
@@ -225,10 +218,30 @@ class StringOrNotString(FieldOperator):
225
  return text
226
 
227
 
228
- class ExtractMtBenchJudgment(FieldOperator):
229
  def process_value(self, text: Any) -> Any:
230
  match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
231
  try:
232
  return float(match.group(1)) / 10
233
  except:
234
  return 0.0


1
+ import ast
2
  import json
3
  import re
4
  from difflib import get_close_matches
 
55
  return ""
56
 
57

58
  class ListToEmptyEntitiesTuples(FieldOperator):
59
  def process_value(self, lst: Any) -> Any:
60
  try:
 
218
  return text
219
 
220
 
221
+ class ExtractMtBenchRatingJudgment(FieldOperator):
222
  def process_value(self, text: Any) -> Any:
223
  match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
224
  try:
225
  return float(match.group(1)) / 10
226
  except:
227
  return 0.0
228
+
229
+
230
+ class ExtractMtBenchLabelJudgment(FieldOperator):
231
+ def process_value(self, text: Any) -> Any:
232
+ match = re.search(r"\[\[([^\]]+)\]\]", text)
233
+ try:
234
+ return str(match.group(1))
235
+ except:
236
+ return "None"
237
+
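The two extractors share the same double-bracket convention; a minimal sketch of both regexes on hypothetical judge outputs:

```python
import re

# Rating judgments like "[[8.5]]" are scaled to the 0-1 range.
rating = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", "Final verdict: [[8.5]]")
assert float(rating.group(1)) / 10 == 0.85

# Label judgments like "[[A]]" are returned as the raw string.
label = re.search(r"\[\[([^\]]+)\]\]", "Final verdict: [[A]]")
assert str(label.group(1)) == "A"
```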
238
+
239
+ class LiteralEval(FieldOperator):
240
+ def process_value(self, text: Any) -> Any:
241
+ if text is not None and not isinstance(text, str):
242
+ raise ValueError(
243
+ f"LiteralEval: field '{self.field}' is expected to be of 'str' input type, got: {type(text)}"
244
+ )
245
+ if text is None or text == "":
246
+ return text
247
+ return ast.literal_eval(text.strip())
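A short sketch of what LiteralEval does with a serialized Python literal (the value here is made up):

```python
import ast

# ast.literal_eval parses Python literals safely, without executing code.
parsed = ast.literal_eval("[('PER', 'John'), ('LOC', 'Paris')]")
assert parsed == [("PER", "John"), ("LOC", "Paris")]
```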
schema.py CHANGED
@@ -34,16 +34,24 @@ class ToUnitxtGroup(StreamInstanceOperatorValidator):
34
  postprocessors: List[str] = field(default_factory=lambda: ["to_string_stripped"])
35
  remove_unnecessary_fields: bool = True
36
 
37
- def _to_lists_of_keys_and_values(self, dict: Dict[str, str]):
38
- return {
39
- "key": [key for key, _ in dict.items()],
40
- "value": [str(value) for _, value in dict.items()],
41
- }
42
 
43
  def process(
44
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
45
  ) -> Dict[str, Any]:
46
- task_data = {**instance["inputs"], **instance["outputs"]}

47
  instance["task_data"] = json.dumps(task_data)
48
 
49
  if self.remove_unnecessary_fields:
 
34
  postprocessors: List[str] = field(default_factory=lambda: ["to_string_stripped"])
35
  remove_unnecessary_fields: bool = True
36
 
37
+ @staticmethod
38
+ def artifact_to_jsonable(artifact):
39
+ if artifact.__id__ is None:
40
+ return artifact.to_dict()
41
+ return artifact.__id__
42
 
43
  def process(
44
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
45
  ) -> Dict[str, Any]:
46
+ task_data = {
47
+ **instance["inputs"],
48
+ **instance["outputs"],
49
+ "metadata": {
50
+ "template": self.artifact_to_jsonable(
51
+ instance["recipe_metadata"]["template"]
52
+ )
53
+ },
54
+ }
55
  instance["task_data"] = json.dumps(task_data)
56
 
57
  if self.remove_unnecessary_fields:
settings_utils.py CHANGED
@@ -126,13 +126,14 @@ if Settings.is_uninitilized():
126
  settings.max_log_message_size = (int, 100000)
127
  settings.artifactories = None
128
  settings.default_recipe = "standard_recipe"
129
- settings.default_verbosity = "debug"
130
  settings.remote_metrics = []
131
  settings.allow_passing_data_to_remote_api = (bool, False)
132
  settings.test_card_disable = (bool, False)
133
  settings.test_metric_disable = (bool, False)
134
  settings.metrics_master_key_token = None
135
  settings.seed = (int, 42)
 
136
 
137
  if Constants.is_uninitilized():
138
  constants = Constants()
 
126
  settings.max_log_message_size = (int, 100000)
127
  settings.artifactories = None
128
  settings.default_recipe = "standard_recipe"
129
+ settings.default_verbosity = "info"
130
  settings.remote_metrics = []
131
  settings.allow_passing_data_to_remote_api = (bool, False)
132
  settings.test_card_disable = (bool, False)
133
  settings.test_metric_disable = (bool, False)
134
  settings.metrics_master_key_token = None
135
  settings.seed = (int, 42)
136
+ settings.skip_artifacts_prepare_and_verify = (bool, False)
137
 
138
  if Constants.is_uninitilized():
139
  constants = Constants()
splitters.py CHANGED
@@ -196,9 +196,12 @@ class DiverseLabelsSampler(Sampler):
196
  raise ValueError(f"'{self.choices}' field is missing from '{inputs}'.")
197
  choices = inputs[self.choices]
198
  if not isinstance(choices, list):
199
- raise ValueError(
200
- f"Unexpected input choices value '{choices}'. Expected a list."
201
- )
 
 
 
202
 
203
  if "outputs" not in exemplar:
204
  raise ValueError(f"'outputs' field is missing from '{exemplar}'.")
 
196
  raise ValueError(f"'{self.choices}' field is missing from '{inputs}'.")
197
  choices = inputs[self.choices]
198
  if not isinstance(choices, list):
199
+ if isinstance(choices, str):
200
+ choices = [choices]
201
+ else:
202
+ raise ValueError(
203
+ f"Unexpected input choices value '{choices}'. Expected a list or a string."
204
+ )
205
 
206
  if "outputs" not in exemplar:
207
  raise ValueError(f"'outputs' field is missing from '{exemplar}'.")
standard.py CHANGED
@@ -135,6 +135,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
135
  self.metadata,
136
  self.standardization,
137
  self.processing,
 
138
  self.verblization,
139
  self.finalize,
140
  ]
@@ -144,6 +145,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
144
  self.inference_instance.steps = [
145
  self.metadata,
146
  self.processing,
 
147
  ]
148
 
149
  self.inference_demos = SourceSequentialOperator()
@@ -153,6 +155,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
153
  self.metadata,
154
  self.standardization,
155
  self.processing,
 
156
  ]
157
 
158
  self.inference = SequentialOperator()
 
135
  self.metadata,
136
  self.standardization,
137
  self.processing,
138
+ self.metadata,
139
  self.verblization,
140
  self.finalize,
141
  ]
 
145
  self.inference_instance.steps = [
146
  self.metadata,
147
  self.processing,
148
+ self.metadata,
149
  ]
150
 
151
  self.inference_demos = SourceSequentialOperator()
 
155
  self.metadata,
156
  self.standardization,
157
  self.processing,
158
+ self.metadata,
159
  ]
160
 
161
  self.inference = SequentialOperator()
stream.py CHANGED
@@ -1,5 +1,6 @@
1
  import tempfile
2
- from typing import Dict, Iterable
 
3
 
4
  from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
5
 
@@ -8,6 +9,36 @@ from .generator_utils import CopyingReusableGenerator, ReusableGenerator
8
 
9
 
10
  class Stream(Dataclass):
11
  """A class for handling streaming data in a customizable way.
12
 
13
  This class provides methods for generating, caching, and manipulating streaming data.
@@ -18,8 +49,8 @@ class Stream(Dataclass):
18
  caching (bool): Whether the data is cached or not. :no-index:
19
  """
20
 
21
- generator: callable
22
- gen_kwargs: Dict[str, any] = OptionalField(default_factory=dict)
23
  caching: bool = False
24
  copying: bool = False
25
 
@@ -147,7 +178,7 @@ class MultiStream(dict):
147
  assert all(isinstance(v, ReusableGenerator) for v in generators.values())
148
  return cls(
149
  {
150
- key: Stream(
151
  generator.generator,
152
  gen_kwargs=generator.gen_kwargs,
153
  caching=caching,
@@ -173,7 +204,7 @@ class MultiStream(dict):
173
  """
174
  return cls(
175
  {
176
- key: Stream(
177
  iterable.__iter__,
178
  caching=caching,
179
  copying=copying,
 
1
  import tempfile
2
+ from abc import abstractmethod
3
+ from typing import Any, Callable, Dict, Iterable, List
4
 
5
  from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
6
 
 
9
 
10
 
11
  class Stream(Dataclass):
12
+ @abstractmethod
13
+ def __iter__(self):
14
+ pass
15
+
16
+ @abstractmethod
17
+ def peek(self):
18
+ pass
19
+
20
+ @abstractmethod
21
+ def take(self, n):
22
+ pass
23
+
24
+
25
+ class ListStream(Stream):
26
+ instances_list: List[Dict[str, Any]]
27
+
28
+ def __iter__(self):
29
+ return iter(self.instances_list)
30
+
31
+ def peek(self):
32
+ return next(iter(self.instances_list))
33
+
34
+ def take(self, n):
35
+ for i, instance in enumerate(self.instances_list):
36
+ if i >= n:
37
+ break
38
+ yield instance
39
+
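A quick sketch of the contract the three methods implement over a plain list (the instances are hypothetical):

```python
instances = [{"x": 1}, {"x": 2}, {"x": 3}]

peeked = next(iter(instances))  # what peek() returns

taken = []  # what take(2) yields
for i, instance in enumerate(instances):
    if i >= 2:
        break
    taken.append(instance)

assert peeked == {"x": 1}
assert taken == [{"x": 1}, {"x": 2}]
```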
40
+
41
+ class GeneratorStream(Stream):
42
  """A class for handling streaming data in a customizable way.
43
 
44
  This class provides methods for generating, caching, and manipulating streaming data.
 
49
  caching (bool): Whether the data is cached or not. :no-index:
50
  """
51
 
52
+ generator: Callable
53
+ gen_kwargs: Dict[str, Any] = OptionalField(default_factory=dict)
54
  caching: bool = False
55
  copying: bool = False
56
 
 
178
  assert all(isinstance(v, ReusableGenerator) for v in generators.values())
179
  return cls(
180
  {
181
+ key: GeneratorStream(
182
  generator.generator,
183
  gen_kwargs=generator.gen_kwargs,
184
  caching=caching,
 
204
  """
205
  return cls(
206
  {
207
+ key: GeneratorStream(
208
  iterable.__iter__,
209
  caching=caching,
210
  copying=copying,
string_operators.py CHANGED
@@ -1,7 +1,12 @@
1
  import re
2
- from typing import List
 
 
 
 
 
3
 
4
- from .operators import FieldOperator
5
 
6
 
7
  class Split(FieldOperator):
@@ -39,6 +44,17 @@ class Join(FieldOperator):
39
  return self.by.join(value)
40
 
41

42
  class Strip(FieldOperator):
43
  def process_value(self, value: str) -> str:
44
  return value.strip()
 
1
  import re
2
+ from typing import (
3
+ Any,
4
+ Dict,
5
+ List,
6
+ Optional,
7
+ )
8
 
9
+ from .operators import FieldOperator, StreamInstanceOperator
10
 
11
 
12
  class Split(FieldOperator):
 
44
  return self.by.join(value)
45
 
46
 
47
+ class FormatText(StreamInstanceOperator):
48
+ to_field: str
49
+ text: str
50
+
51
+ def process(
52
+ self, instance: Dict[str, Any], stream_name: Optional[str] = None
53
+ ) -> Dict[str, Any]:
54
+ instance[self.to_field] = self.text.format(**instance)
55
+ return instance
56
+
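A minimal sketch of what FormatText computes, with hypothetical field names:

```python
instance = {"context": "arithmetic", "question": "What is 2+2?"}
text = "Context: {context}\nQuestion: {question}"

# Every placeholder in 'text' is filled from the instance's fields.
instance["source"] = text.format(**instance)
assert instance["source"] == "Context: arithmetic\nQuestion: What is 2+2?"
```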
57
+
58
  class Strip(FieldOperator):
59
  def process_value(self, value: str) -> str:
60
  return value.strip()
struct_data_operators.py CHANGED
@@ -547,3 +547,22 @@ class ShuffleTableColumns(FieldOperator):
547
  table_content["rows"] = shuffled_rows
548
 
549
  return table_content


547
  table_content["rows"] = shuffled_rows
548
 
549
  return table_content
550
+
551
+
552
+ class LoadJson(FieldOperator):
553
+ failure_value: Any = None
554
+ allow_failure: bool = False
555
+
556
+ def process_value(self, value: str) -> Any:
557
+ if self.allow_failure:
558
+ try:
559
+ return json.loads(value)
560
+ except json.JSONDecodeError:
561
+ return self.failure_value
562
+ else:
563
+ return json.loads(value)
564
+
565
+
566
+ class DumpJson(FieldOperator):
567
+ def process_value(self, value: Any) -> str:
568
+ return json.dumps(value)
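A standalone sketch of the allow_failure path (the function name here is illustrative, not the operator's API):

```python
import json


def load_json(value, allow_failure=True, failure_value=None):
    # With allow_failure, malformed JSON falls back to failure_value.
    if allow_failure:
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return failure_value
    return json.loads(value)


assert load_json('{"a": 1}') == {"a": 1}
assert load_json("not json") is None
```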
task.py CHANGED
@@ -13,11 +13,7 @@ from .type_utils import (
13
  )
14
 
15
 
16
- class Tasker:
17
- pass
18
-
19
-
20
- class FormTask(Tasker, StreamInstanceOperator):
21
  """FormTask packs the different instance fields into dictionaries by their roles in the task.
22
 
23
  Attributes:
@@ -119,49 +115,5 @@ class FormTask(Tasker, StreamInstanceOperator):
119
  }
120
 
121
 
122
- class MultipleChoiceTask(FormTask):
123
- choices_field: str = "choices"
124
- choices_separator: str = "\n"
125
- enumeration_suffix: str = ". "
126
- use_text_in_target: bool = False
127
- alphabet: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
128
-
129
- def process_single_choice(
130
- self, choice: str, index: int, use_text: bool = True
131
- ) -> str:
132
- try:
133
- processed_choice = f"{self.alphabet[index]}"
134
- except IndexError as e:
135
- raise ValueError(
136
- f"Too many choices, the length of alphabet '{self.alphabet}': {len(self.alphabet)} is the limit"
137
- ) from e
138
- if use_text:
139
- processed_choice += f"{self.enumeration_suffix}{choice}"
140
- return processed_choice
141
-
142
- def process_choices(self, choices: List[str]) -> str:
143
- processed_choices = []
144
- for index, choice in enumerate(choices):
145
- processed_choices.append(self.process_single_choice(choice, index))
146
- return self.choices_separator.join(processed_choices)
147
-
148
- def process_target(self, choices, target_index):
149
- return self.process_single_choice(
150
- choices[target_index], target_index, use_text=self.use_text_in_target
151
- )
152
-
153
- def process(
154
- self, instance: Dict[str, Any], stream_name: Optional[str] = None
155
- ) -> Dict[str, Any]:
156
- result = super().process(instance, stream_name)
157
- target_key, target_value = next(iter(result["outputs"].items()))
158
- choices = result["inputs"][self.choices_field]
159
- target_index_in_choices = choices.index(target_value)
160
-
161
- processed_choices = self.process_choices(choices)
162
- processed_target = self.process_target(choices, target_index_in_choices)
163
-
164
- result["inputs"][self.choices_field] = processed_choices
165
- result["outputs"][target_key] = processed_target
166
-
167
- return result
 
13
  )
14
 
15
 
16
+ class Task(StreamInstanceOperator):
 
 
 
 
17
  """FormTask packs the different instance fields into dictionaries by their roles in the task.
18
 
19
  Attributes:
 
115
  }
116
 
117
 
118
+ class FormTask(Task):
119
+ pass

templates.py CHANGED
@@ -1,7 +1,9 @@
1
  import json
2
  from abc import abstractmethod
 
3
  from typing import Any, Dict, List, Optional, Tuple, Union
4
 
 
5
  from .collections import ListCollection
6
  from .dataclass import NonPositionalField
7
  from .operator import StreamInstanceOperator
@@ -48,6 +50,11 @@ class Template(StreamInstanceOperator):
48
  )
49
  return instruction, target_prefix
50
 
 
 
 
 
 
51
  def process(
52
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
53
  ) -> Dict[str, Any]:
@@ -61,9 +68,9 @@ class Template(StreamInstanceOperator):
61
 
62
  inputs = instance.get("inputs")
63
  outputs = instance.get("outputs")
 
64
 
65
  self.set_titles(inputs)
66
-
67
  source = self.inputs_to_source(inputs)
68
  instruction, target_prefix = self.inputs_to_instruction_and_target_prefix(
69
  inputs
@@ -150,6 +157,135 @@ class InputOutputTemplateWithCustomTarget(InputOutputTemplate):
150
  return target, [reference]
151
 
152

153
  class MultipleChoiceTemplate(Template):
154
  """Formats the input (that specifies the question), the multiple choices to select the answer from, and specifies the field with the correct answer."""
155
 
@@ -328,25 +464,20 @@ class YesNoTemplate(Template):
328
  raise RuntimeError(
329
  f"Available outputs are {list(outputs.keys())}, missing required label field: '{self.label_field}'."
330
  ) from e
331
- if not isinstance(gold_class_names, list) or not gold_class_names:
332
  raise RuntimeError(
333
- f"Unexpected value for gold_class_names: '{gold_class_names}'. Expected a non-empty list."
334
  )
335
  try:
336
- queried_class_names = outputs[self.class_field]
337
  except KeyError as e:
338
  raise RuntimeError(
339
  f"Available outputs are {list(outputs.keys())}, missing required class field: '{self.class_field}'."
340
  ) from e
341
- if (
342
- not queried_class_names
343
- or not isinstance(queried_class_names, list)
344
- or not len(queried_class_names) == 1
345
- ):
346
  raise RuntimeError(
347
- f"Unexpected value for queried_class_names: '{queried_class_names}'. Expected a list with one item."
348
  )
349
- queried_class_name = queried_class_names[0]
350
  if queried_class_name in gold_class_names:
351
  return self.yes_answer, [self.yes_answer]
352
  return self.no_answer, [self.no_answer]
 
1
  import json
2
  from abc import abstractmethod
3
+ from random import random
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
6
+ from .artifact import Artifact
7
  from .collections import ListCollection
8
  from .dataclass import NonPositionalField
9
  from .operator import StreamInstanceOperator
 
50
  )
51
  return instruction, target_prefix
52
 
53
+ def preprocess_inputs_and_outputs(
54
+ self, inputs: Dict[str, Any], outputs: Dict[str, Any]
55
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
56
+ return inputs, outputs
57
+
58
  def process(
59
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
60
  ) -> Dict[str, Any]:
 
68
 
69
  inputs = instance.get("inputs")
70
  outputs = instance.get("outputs")
71
+ inputs, outputs = self.preprocess_inputs_and_outputs(inputs, outputs)
72
 
73
  self.set_titles(inputs)
 
74
  source = self.inputs_to_source(inputs)
75
  instruction, target_prefix = self.inputs_to_instruction_and_target_prefix(
76
  inputs
 
157
  return target, [reference]
158
 
159
 
160
+ class PairwiseChoiceTemplate(InputOutputTemplate):
161
+ """PairwiseChoiceTemplate.
162
+
163
+ Requirements:
164
+ The answer field value should be of type Literal["choice_a", "choice_b", "tie"]
165
+
166
+ Args:
167
+ choice_a_field (str): The field which contains choice_a value
168
+ choice_b_field (str): The field which contains choice_b value
169
+ answer_field (str): The field which contains the answer value.
170
+ Should be of type Literal["choice_a", "choice_b", "tie"]
171
+ choice_a_label (str): The label of choice A answer as it is verbalized in the template.
172
+ choice_b_label (str): The label of choice B answer as it is verbalized in the template.
173
+ choice_tie_label (str): The label of a tie answer as it should be verbalized in the template.
174
+ shuffle (bool): whether to shuffle the choices or not. This is done to take into account position bias.
175
+
176
+ shuffle: 50% of the time:
177
+ 1) The values of choice_a_field and choice_b_field will be swapped.
178
+ 2) If the value of answer_field is choice_a_label, set it to choice_b_label.
179
+ Else if the value of answer_field is choice_b_label, set it to choice_a_label.
180
+ Else if the value of answer_field is choice_tie_label, do nothing.
181
+
182
+ """
183
+
184
+ choice_a_field: str
185
+ choice_b_field: str
186
+ answer_field: str
187
+ choice_a_label: str
188
+ choice_b_label: str
189
+ choice_tie_label: str
190
+ shuffle: bool
191
+
192
+ def verbalize_answer_field(self, outputs: Dict[str, object]):
193
+ answer = outputs[self.answer_field]
194
+ assert answer in ["choice_a", "choice_b", "tie"]
195
+ if answer == "choice_a":
196
+ outputs[self.answer_field] = self.choice_a_label
197
+ elif answer == "choice_b":
198
+ outputs[self.answer_field] = self.choice_b_label
199
+ else:
200
+ outputs[self.answer_field] = self.choice_tie_label
201
+
202
+ return outputs
203
+
204
+ def shuffle_values(self, inputs: Dict[str, object], outputs: Dict[str, object]):
205
+ outcome = random() # A float between 0 and 1
206
+ if outcome <= 0.5:
207
+ choice_a_value = inputs[self.choice_a_field]
208
+ choice_b_value = inputs[self.choice_b_field]
209
+
210
+ inputs[self.choice_a_field] = choice_b_value
211
+ inputs[self.choice_b_field] = choice_a_value
212
+
213
+ answer = outputs[self.answer_field]
214
+ assert answer in [
215
+ self.choice_a_label,
216
+ self.choice_b_label,
217
+ self.choice_tie_label,
218
+ ]
219
+ if answer == self.choice_a_label:
220
+ outputs[self.answer_field] = self.choice_b_label
221
+ elif answer == self.choice_b_label:
222
+ outputs[self.answer_field] = self.choice_a_label
223
+
224
+ return inputs, outputs
225
+
226
+ def preprocess_inputs_and_outputs(
227
+ self, inputs: Dict[str, Any], outputs: Dict[str, Any]
228
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
229
+ outputs = self.verbalize_answer_field(outputs)
230
+ inputs, outputs = self.shuffle_values(inputs, outputs)
231
+ return inputs, outputs
232
+
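A minimal sketch of the 50% swap that shuffle_values performs (field names and labels here are hypothetical):

```python
from random import random

inputs = {"choice_a": "Paris", "choice_b": "London"}
outputs = {"winner": "A"}  # labels: "A", "B", "tie"

if random() <= 0.5:
    # Swap the choices and flip a non-tie answer accordingly.
    inputs["choice_a"], inputs["choice_b"] = inputs["choice_b"], inputs["choice_a"]
    if outputs["winner"] == "A":
        outputs["winner"] = "B"
    elif outputs["winner"] == "B":
        outputs["winner"] = "A"
```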
233
+
234
+ class DialogFieldsData(Artifact):
235
+ user_role_label: str
236
+ assistant_role_label: str
237
+ system_role_label: str
238
+ dialog_field: str
239
+
240
+
241
+ class DialogTemplate(InputOutputTemplate):
242
+ dialog_fields: List[DialogFieldsData]
243
+ turns_separator: str = "\n\n"
244
+ label_separator: str = " "
245
+
246
+ def process_dialog(self, inputs: Dict[str, object]):
247
+ for dialog_fields in self.dialog_fields:
248
+ dialog = inputs[dialog_fields.dialog_field]
249
+ # TODO: update isoftype method to support Literal verification and check
250
+ # it's List[Tuple[Literal["user", "assistant", "system"], str]] (Issue #799)
251
+ assert isoftype(dialog, List[Tuple[str, str]])
252
+
253
+ user_role_label = dialog_fields.user_role_label
254
+ assistant_role_label = dialog_fields.assistant_role_label
255
+ system_role_label = dialog_fields.system_role_label
256
+
257
+ dialog_str = ""
258
+ for i, turn in enumerate(dialog):
259
+ (turn_type, turn_text) = turn
260
+ turns_separator = "" if i == 0 else self.turns_separator
261
+ if turn_type == "user":
262
+ dialog_str += f"{turns_separator}{user_role_label}{self.label_separator}{turn_text}"
263
+ elif turn_type == "assistant":
264
+ dialog_str += f"{turns_separator}{assistant_role_label}{self.label_separator}{turn_text}"
265
+ elif turn_type == "system":
266
+ dialog_str += f"{turns_separator}{system_role_label}{self.label_separator}{turn_text}"
267
+
268
+ inputs[dialog_fields.dialog_field] = dialog_str
269
+ return inputs
270
+
271
+ def preprocess_inputs_and_outputs(
272
+ self, inputs: Dict[str, Any], outputs: Dict[str, Any]
273
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
274
+ return self.process_dialog(inputs), outputs
275
+
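A standalone sketch of the serialization above, assuming "user:"/"agent:"/"system:" role labels and the default separators:

```python
dialog = [("user", "Hello"), ("assistant", "Hi! How can I help?")]
labels = {"user": "user:", "assistant": "agent:", "system": "system:"}

dialog_str = ""
for i, (turn_type, turn_text) in enumerate(dialog):
    separator = "" if i == 0 else "\n\n"  # turns_separator between turns only
    dialog_str += f"{separator}{labels[turn_type]} {turn_text}"

assert dialog_str == "user: Hello\n\nagent: Hi! How can I help?"
```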
276
+
277
+ class DialogPairwiseChoiceTemplate(DialogTemplate, PairwiseChoiceTemplate):
278
+ def preprocess_inputs_and_outputs(
279
+ self, inputs: Dict[str, Any], outputs: Dict[str, Any]
280
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
281
+ inputs, outputs = DialogTemplate.preprocess_inputs_and_outputs(
282
+ self, inputs, outputs
283
+ )
284
+ return PairwiseChoiceTemplate.preprocess_inputs_and_outputs(
285
+ self, inputs, outputs
286
+ )
287
+
288
+
289
  class MultipleChoiceTemplate(Template):
290
  """Formats the input (that specifies the question), the multiple choices to select the answer from, and specifies the field with the correct answer."""
291
 
 
464
  raise RuntimeError(
465
  f"Available outputs are {list(outputs.keys())}, missing required label field: '{self.label_field}'."
466
  ) from e
467
+ if not isinstance(gold_class_names, list):
468
  raise RuntimeError(
469
+ f"Unexpected value for gold_class_names: '{gold_class_names}'. Expecting a list."
470
  )
471
  try:
472
+ queried_class_name = outputs[self.class_field]
473
  except KeyError as e:
474
  raise RuntimeError(
475
  f"Available outputs are {list(outputs.keys())}, missing required class field: '{self.class_field}'."
476
  ) from e
477
+ if not queried_class_name or not isinstance(queried_class_name, str):
 
 
 
 
478
  raise RuntimeError(
479
+ f"Unexpected value for queried_class_names: '{queried_class_name}'. Expected a string."
480
  )
 
481
  if queried_class_name in gold_class_names:
482
  return self.yes_answer, [self.yes_answer]
483
  return self.no_answer, [self.no_answer]
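In effect, the relaxed check reduces to a membership test on a single string (the values here are hypothetical):

```python
gold_class_names = ["cat", "dog"]
queried_class_name = "cat"

answer = "Yes" if queried_class_name in gold_class_names else "No"
assert answer == "Yes"
```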
text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
  import shutil
 
3
 
4
  from .logging_utils import get_logger
5
 
@@ -129,3 +130,42 @@ def nested_tuple_to_string(nested_tuple: tuple) -> str:
129
  def is_made_of_sub_strings(string, sub_strings):
130
  pattern = "^(" + "|".join(map(re.escape, sub_strings)) + ")+$"
131
  return bool(re.match(pattern, string))


1
  import re
2
  import shutil
3
+ from typing import List, Tuple
4
 
5
  from .logging_utils import get_logger
6
 
 
130
  def is_made_of_sub_strings(string, sub_strings):
131
  pattern = "^(" + "|".join(map(re.escape, sub_strings)) + ")+$"
132
  return bool(re.match(pattern, string))
133
+
134
+
135
+ # Given all the lines of a file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
136
+ # and an object name, e.g. TaskCard,
137
+ # return the ordinal number of the line that starts that object, in our example: the
138
+ # line number of the following line (notice that the line where TaskCard is imported
139
+ # is not supposed to be returned):
140
+ # card = TaskCard(
141
+ # and the line number of the line that ends the object, in our case the line that includes
142
+ # the matching close:
143
+ # )
144
+ # This util depends on ruff to ensure this formatting of the card file: that the close of one
145
+ # tag and the open of the next tag do not sit on the same line, both tags being
146
+ # at the major level within TaskCard
147
+ # flake8: noqa: B007
148
+ def lines_defining_obj(
149
+ all_lines: List[str], obj_name: str, start_search_at_line: int = 0
150
+ ) -> Tuple[int, int]:
151
+ for starting_line in range(start_search_at_line, len(all_lines)):
152
+ line = all_lines[starting_line]
153
+ if obj_name in line:
154
+ break
155
+ if obj_name not in line:
156
+ # obj_name was found nowhere in the input lines
157
+ return (-1, -1)
158
+ num_of_opens = 0
159
+ num_of_closes = 0
160
+ for ending_line in range(starting_line, len(all_lines)):
161
+ num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
162
+ num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
163
+ if num_of_closes == num_of_opens:
164
+ break
165
+
166
+ if num_of_closes != num_of_opens:
167
+ raise ValueError(
168
+ "input lines were exhausted before the matching close is found"
169
+ )
170
+
171
+ return (starting_line, ending_line)
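A toy usage of lines_defining_obj; passing the object name with its opening parenthesis avoids matching a bare import line, and the returned indices are 0-based:

```python
all_lines = [
    "card = TaskCard(",
    "    loader=LoadHF(path='glue'),",
    ")",
]
# Line 0 opens the object; its matching close is found on line 2.
assert lines_defining_obj(all_lines, "TaskCard(") == (0, 2)
```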
version.py CHANGED
@@ -1 +1 @@
1
- version = "1.8.1"
 
1
+ version = "1.9.0"