Elron committed
Commit d443ad5 · verified · 1 Parent(s): cea5047

Upload folder using huggingface_hub

Files changed (12):
  1. augmentors.py +3 -6
  2. image_operators.py +12 -0
  3. inference.py +1381 -426
  4. llm_as_judge.py +14 -2
  5. loaders.py +9 -9
  6. metrics.py +7 -0
  7. operators.py +15 -8
  8. settings_utils.py +1 -1
  9. standard.py +6 -9
  10. task.py +23 -19
  11. text_utils.py +2 -1
  12. version.py +1 -1
augmentors.py CHANGED

@@ -49,7 +49,7 @@ class TextAugmentor(TypeDependentAugmentor):
     augmented_type = Text


- class NullAugmentor(Augmentor):
+ class NullAugmentor(TaskInputsAugmentor):
     """Does not change the input string."""

     def process_value(self, value: Any) -> Any:

@@ -83,12 +83,9 @@ class AugmentPrefixSuffix(TextAugmentor):
    r"""Augments the input by prepending and appending randomly selected (typically, whitespace) patterns.

    Args:
-         prefixes, suffixes (list or dict) : the potential (typically, whitespace) patterns to select from.
-             The dictionary version allows the specification relative weights for the different patterns.
+         prefixes, suffixes (list or dict) : the potential patterns (typically, whitespace) to select from. The dictionary version allows the specification relative weights for the different patterns.
        prefix_len, suffix_len (positive int) : The added prefix or suffix will be of a certain length.
-         remove_existing_whitespaces : Clean any existing leading and trailing whitespaces.
-             The strings made of repetitions of the selected pattern(s) are then prepended and/or appended to the potentially
-             trimmed input.
+         remove_existing_whitespaces : Clean any existing leading and trailing whitespaces. The strings made of repetitions of the selected pattern(s) are then prepended and/or appended to the potentially trimmed input.
        If only either just prefixes or just suffixes are needed, set the other to None.

    Examples:
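
Not part of the commit: a minimal sketch of how the AugmentPrefixSuffix arguments documented in the hunk above might be passed. The import path is an assumption about how the package exposes the class.

# Hedged sketch only; the unitxt.augmentors import path is assumed, not taken from the diff.
from unitxt.augmentors import AugmentPrefixSuffix

augmentor = AugmentPrefixSuffix(
    prefixes={" ": 10, "\t": 2},       # dict form: pattern -> relative weight
    suffixes=None,                      # only prefixes are needed, so suffixes is set to None
    prefix_len=3,                       # the selected pattern is repeated to this length
    remove_existing_whitespaces=True,   # trim the input before prepending the generated prefix
)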
image_operators.py CHANGED

@@ -93,6 +93,18 @@ def extract_images(text, instance):
     return images


+ class EncodeImageToString(FieldOperator):
+     image_format: str = "JPEG"
+
+     def encode_image_to_base64(self, image):
+         buffer = io.BytesIO()
+         image.save(buffer, format=self.image_format)
+         return base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+     def process_value(self, value: Any) -> Any:
+         return {"image": self.encode_image_to_base64(value)}
+
+
 class DecodeImage(FieldOperator, PillowMixin):
     def process_value(self, value: str) -> Any:
         image_data = base64.b64decode(value)
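
Not part of the commit: a minimal usage sketch for the new EncodeImageToString operator shown above, assuming the module is importable as unitxt.image_operators and that process_value can be called directly on an instance.

# Hedged sketch: encode a PIL image to the {"image": <base64 string>} form produced by EncodeImageToString.
from PIL import Image
from unitxt.image_operators import EncodeImageToString  # assumed import path

encoder = EncodeImageToString(image_format="PNG")
image = Image.new("RGB", (4, 4), color="white")
encoded = encoder.process_value(image)  # -> {"image": "<base64-encoded PNG bytes>"}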
inference.py CHANGED

@@ -9,7 +9,18 @@ import sys
 import time
 import uuid
 from collections import Counter
- from typing import Any, Dict, List, Literal, Optional, Union

 from datasets import DatasetDict
 from tqdm import tqdm, trange

@@ -19,11 +30,12 @@ from .artifact import Artifact
 from .dataclass import InternalField, NonPositionalField
 from .deprecation_utils import deprecation
 from .error_utils import UnitxtError
- from .image_operators import data_url_to_image, extract_images
 from .logging_utils import get_logger
 from .operator import PackageRequirementsMixin
 from .operators import ArtifactFetcherMixin
 from .settings_utils import get_constants, get_settings

 constants = get_constants()
 settings = get_settings()

@@ -67,6 +79,9 @@ class TextGenerationInferenceOutput:

        input_tokens (int) : number of input tokens to the model.
        output_tokens (int) : number of output tokens to the model.
        model_name (str): the model_name as kept in the InferenceEngine.
        inference_type (str): The label stating the type of the InferenceEngine.
    """

@@ -74,6 +89,9 @@ class TextGenerationInferenceOutput:
    prediction: Union[str, List[Dict[str, Any]]]
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    model_name: Optional[str] = None
    inference_type: Optional[str] = None

@@ -152,6 +170,10 @@ class InferenceEngine(Artifact):
            if param_inst_val is None:
                setattr(self, param, param_dict_val)

    def verify_not_chat_api(self, dataset):
        if isinstance(dataset[0]["source"], list):
            raise NotImplementedError(
@@ -216,259 +238,898 @@ class LazyLoadMixin(Artifact):
        pass


- class HFPipelineBasedInferenceEngine(
-     InferenceEngine, PackageRequirementsMixin, LazyLoadMixin
- ):
-     model_name: str
    max_new_tokens: int
-     use_fp16: bool = True
-     batch_size: int = 1
    top_k: Optional[int] = None

    _requirements_list = {
-         "transformers": "Install huggingface package using 'pip install --upgrade transformers"
    }

-     def get_engine_id(self):
-         return get_model_and_label_id(self.model_name, "hf_pipeline")

-     def _get_task(self):
-         from transformers import AutoConfig

-         return (
-             "text2text-generation"
-             if AutoConfig.from_pretrained(
-                 self.model_name, trust_remote_code=True
-             ).is_encoder_decoder
-             else "text-generation"
-         )

-     def _prepare_pipeline(self):
-         import torch
-         from transformers import pipeline

-         model_args: Dict[str, Any] = (
-             {"torch_dtype": torch.float16} if self.use_fp16 else {}
-         )
-         model_args.update({"max_new_tokens": self.max_new_tokens})

-         device = torch.device(
-             "mps"
-             if torch.backends.mps.is_available()
-             else 0
-             if torch.cuda.is_available()
-             else "cpu"
-         )
-         # We do this, because in some cases, using device:auto will offload some weights to the cpu
-         # (even though the model might *just* fit to a single gpu), even if there is a gpu available, and this will
-         # cause an error because the data is always on the gpu
-         if torch.cuda.device_count() > 1:
-             assert device == torch.device(0)
-             model_args.update({"device_map": "auto"})
-         else:
-             model_args.update({"device": device})

-         task = self._get_task()

-         if task == "text-generation":
-             model_args.update({"return_full_text": False})

-         self.model = pipeline(
-             model=self.model_name, trust_remote_code=True, **model_args
-         )

    def prepare_engine(self):
        if not self.lazy_load:
-             self._prepare_pipeline()

-     def _is_loaded(self):
-         return hasattr(self, "model") and self.model is not None

-     def _infer(
-         self,
-         dataset: Union[List[Dict[str, Any]], DatasetDict],
-         return_meta_data: bool = False,
-     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         if self._get_task() == "text2text-generation":
-             self.verify_not_chat_api(dataset)

-         if not self._is_loaded():
-             self._prepare_pipeline()

-         outputs = []
-         for output in self.model(
-             [instance["source"] for instance in dataset],
-             batch_size=self.batch_size,
-             top_k=self.top_k,
-         ):
-             if isinstance(output, list):
-                 output = output[0]
-             outputs.append(output["generated_text"])
-         return outputs


- class MockInferenceEngine(InferenceEngine):
-     model_name: str
-     default_inference_value: str = "[[10]]"

-     def get_engine_id(self):
-         return get_model_and_label_id(self.model_name, "mock")

-     def prepare_engine(self):
-         return

-     def _mock_infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         return [self.default_inference_value for _ in dataset]

    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         return self._mock_infer(dataset)
-
-
- class MockModeMixin(Artifact):
-     mock_mode: bool = False


- class IbmGenAiInferenceEngineParamsMixin(Artifact):
-     beam_width: Optional[int] = None
-     decoding_method: Optional[Literal["greedy", "sample"]] = None
-     include_stop_sequence: Optional[bool] = None
-     length_penalty: Any = None
-     max_new_tokens: Optional[int] = None
-     min_new_tokens: Optional[int] = None
-     random_seed: Optional[int] = None
-     repetition_penalty: Optional[float] = None
-     return_options: Any = None
-     stop_sequences: Optional[List[str]] = None
-     temperature: Optional[float] = None
-     time_limit: Optional[int] = None
-     top_k: Optional[int] = None
-     top_p: Optional[float] = None
-     truncate_input_tokens: Optional[int] = None
-     typical_p: Optional[float] = None


- @deprecation(version="2.0.0", alternative=IbmGenAiInferenceEngineParamsMixin)
- class IbmGenAiInferenceEngineParams(Artifact):
-     beam_width: Optional[int] = None
-     decoding_method: Optional[Literal["greedy", "sample"]] = None
-     include_stop_sequence: Optional[bool] = None
-     length_penalty: Any = None
-     max_new_tokens: Optional[int] = None
-     min_new_tokens: Optional[int] = None
-     random_seed: Optional[int] = None
-     repetition_penalty: Optional[float] = None
-     return_options: Any = None
-     stop_sequences: Optional[List[str]] = None
-     temperature: Optional[float] = None
-     time_limit: Optional[int] = None
-     top_k: Optional[int] = None
-     top_p: Optional[float] = None
-     truncate_input_tokens: Optional[int] = None
-     typical_p: Optional[float] = None


- class GenericInferenceEngine(InferenceEngine, ArtifactFetcherMixin):
-     default: Optional[str] = None

-     def prepare_engine(self):
-         if "UNITXT_INFERENCE_ENGINE" in os.environ:
-             engine_reference = os.environ["UNITXT_INFERENCE_ENGINE"]
-         else:
-             assert self.default is not None, (
-                 "GenericInferenceEngine could not be initialized"
-                 '\nThis is since both the "UNITXT_INFERENCE_ENGINE" environmental variable is not set and no default engine was not inputted.'
-                 "\nFor example, you can fix it by setting"
-                 "\nexport UNITXT_INFERENCE_ENGINE=engines.ibm_gen_ai.llama_3_70b_instruct"
-                 "\nto your ~/.bashrc"
-                 "\nor passing a similar required engine in the default argument"
-             )
-             engine_reference = self.default
-         self.engine = self.get_artifact(engine_reference)

-     def get_engine_id(self):
-         return "generic_inference_engine"

-     def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
-         return_meta_data: bool = False,
-     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         return self.engine._infer(dataset)


- class OllamaInferenceEngine(
-     InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin
- ):
-     label: str = "ollama"
-     _requirements_list = {
-         "ollama": "Install ollama package using 'pip install --upgrade ollama"
-     }
-     data_classification_policy = ["public", "proprietary"]

-     def get_engine_id(self):
-         return get_model_and_label_id(self.model, self.label)

-     def prepare_engine(self):
-         pass

    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         import ollama
-
-         args = self.to_dict([StandardAPIParamsMixin])
-
-         results = []
-
-         for instance in dataset:
-             messages = self.to_messages(instance)
-             response = ollama.chat(
-                 model=self.model,
-                 messages=messages,
-                 **args,
-             )
-             results.append(response)

-         return [element["message"]["content"] for element in results]


- class OptionSelectingByLogProbsInferenceEngine:
-     """OptionSelectingByLogProbsInferenceEngine inference engine is used to select an option based on the logprobs of an options list conditioned by a prompt.

-     The inference engines that inherit from this class must implement `get_token_count` and `get_options_log_probs`.
-     """

-     @abc.abstractmethod
-     def get_token_count(self, dataset):
-         """Get the token count of the source key of each dict of the dataset. Add to each instance in the data a "token_count" field.

-         Args:
-             dataset (List[Dict[str, Any]]): A list of dictionaries, each representing a data instance.

-         Returns:
-             List[int]: The token count of the texts
-         """

-     @abc.abstractmethod
-     def get_options_log_probs(self, dataset):
-         """Get the token logprobs of the options of the key task_data.options of each dict of the dataset.

-         Add to each instance in the data a "options_log_prob" field, which is a dict with str as key and a list of {text: str, logprob:float}.

-         Args:
-             dataset (List[Dict[str, Any]]): A list of dictionaries, each representing a data instance.

-         Returns:
-             List[int]: The token count of the texts
    """

    def select(self, dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -552,12 +1213,14 @@ class IbmGenAiInferenceEngine(
    }
    data_classification_policy = ["public", "proprietary"]
    parameters: Optional[IbmGenAiInferenceEngineParams] = None

    def get_engine_id(self):
        return get_model_and_label_id(self.model_name, self.label)

-     def prepare_engine(self):
-         from genai import Client, Credentials

        api_key_env_var_name = "GENAI_KEY"
        api_key = os.environ.get(api_key_env_var_name)

@@ -566,9 +1229,22 @@ class IbmGenAiInferenceEngine(
                f"Error while trying to run IbmGenAiInferenceEngine."
                f" Please set the environment param '{api_key_env_var_name}'."
            )
-         credentials = Credentials(api_key=api_key)
        self.client = Client(credentials=credentials)

        self._set_inference_parameters()

    def _infer(

@@ -576,22 +1252,26 @@ class IbmGenAiInferenceEngine(
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         from genai.schema import TextGenerationParameters

        genai_params = TextGenerationParameters(
            **self.to_dict([IbmGenAiInferenceEngineParamsMixin])
        )

-         results = []
        responses = self.client.text.generation.create(
            model_id=self.model_name,
            inputs=[instance["source"] for instance in dataset],
            parameters=genai_params,
        )
        for response in responses:
-             generated_text = response.results[0].generated_text
            result = self.get_return_object(
-                 generated_text, response.results[0], return_meta_data
            )
            results.append(result)
        return results

@@ -601,7 +1281,9 @@ class IbmGenAiInferenceEngine(
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
-         from genai.schema import TextGenerationParameters

        logprobs_return_options = {
            "generated_tokens": True,

@@ -620,11 +1302,12 @@ class IbmGenAiInferenceEngine(
            model_id=self.model_name,
            inputs=[instance["source"] for instance in dataset],
            parameters=genai_params,
        )

        predict_results = []
        for prediction in predictions:
-             result = prediction.results[0]
            assert isinstance(
                result.generated_tokens, list
            ), "result.generated_tokens should be a list"

@@ -651,9 +1334,22 @@ class IbmGenAiInferenceEngine(
                output_tokens=result.generated_token_count,
                model_name=self.model_name,
                inference_type=self.label,
            )
        return predict_result

    def get_token_count(self, dataset):
        texts = [instance["source"] for instance in dataset]
        token_counts = list(

@@ -973,6 +1669,10 @@ class VLLMRemoteInferenceEngine(OpenAiInferenceEngine):
        return OpenAI(api_key=api_key, base_url=api_url)


class WMLInferenceEngineParamsMixin(Artifact):
    decoding_method: Optional[Literal["greedy", "sample"]] = None
    length_penalty: Optional[Dict[str, Union[int, float]]] = None

@@ -1008,78 +1708,87 @@ class WMLInferenceEngineParams(Artifact):
    return_options: Optional[Dict[str, bool]] = None


- class WMLInferenceEngine(
    InferenceEngine,
-     WMLInferenceEngineParamsMixin,
    PackageRequirementsMixin,
    LogProbInferenceEngine,
    OptionSelectingByLogProbsInferenceEngine,
):
-     """Runs inference using ibm-watsonx-ai.

    Attributes:
        credentials (Dict[str, str], optional): By default, it is created by a class
            instance which tries to retrieve proper environment variables
-             ("WML_URL", "WML_PROJECT_ID", "WML_APIKEY"). However, a dictionary with
-             the following keys: "url", "apikey", "project_id" can be directly provided
-             instead.
        model_name (str, optional): ID of a model to be used for inference. Mutually
            exclusive with 'deployment_id'.
        deployment_id (str, optional): Deployment ID of a tuned model to be used for
            inference. Mutually exclusive with 'model_name'.
-         parameters (WMLInferenceEngineParams, optional): Instance of WMLInferenceEngineParams
-             which defines inference parameters and their values. Deprecated attribute, please
-             pass respective parameters directly to the WMLInferenceEngine class instead.
-         concurrency_limit (int): number of requests that will be sent in parallel, max is 10.
-
-     Examples:
-         from .api import load_dataset
-
-         wml_credentials = {
-             "url": "some_url", "project_id": "some_id", "api_key": "some_key"
-         }
-         model_name = "google/flan-t5-xxl"
-         wml_inference = WMLInferenceEngine(
-             credentials=wml_credentials,
-             model_name=model_name,
-             data_classification_policy=["public"],
-             top_p=0.5,
-             random_seed=123,
-         )
-
-         dataset = load_dataset(
-             dataset_query="card=cards.argument_topic,template_card_index=0,loader_limit=5"
-         )
-         results = wml_inference.infer(dataset["test"])
    """

-     credentials: Optional[Dict[Literal["url", "apikey", "project_id"], str]] = None
    model_name: Optional[str] = None
    deployment_id: Optional[str] = None
    label: str = "wml"
    _requirements_list = {
-         "ibm-watsonx-ai==1.1.14": "Install ibm-watsonx-ai package using 'pip install --upgrade ibm-watsonx-ai'. "
        "It is advised to have Python version >=3.10 installed, as at lower version this package "
        "may cause conflicts with other installed packages."
    }
    data_classification_policy = ["public", "proprietary"]
-     parameters: Optional[WMLInferenceEngineParams] = None
-     concurrency_limit: int = 10
    _client: Any = InternalField(default=None, name="WML client")

    def get_engine_id(self):
-         return get_model_and_label_id(self.model_name, self.label)

    def verify(self):
        super().verify()

-         if self.credentials is not None:
-             for key in self.credentials:
-                 if key not in ["url", "apikey", "project_id", "space_id"]:
-                     raise ValueError(
-                         f'Illegal credential key: {key}, use only ["url", "apikey", "project_id", "space_id"]'
-                     )
-
        assert (
            self.model_name
            or self.deployment_id

@@ -1095,166 +1804,186 @@ class WMLInferenceEngine(
                data["credentials"][key] = value
        return data

    @staticmethod
-     def _read_wml_credentials_from_env() -> (
-         Dict[Literal["url", "apikey", "project_id", "space_id"], str]
-     ):
-         credentials = {}
-         project_or_deployment_var_name = (
-             "WML_SPACE_ID" if "WML_SPACE_ID" in os.environ else "WML_PROJECT_ID"
        )

-         for env_var_name in ["WML_URL", project_or_deployment_var_name, "WML_APIKEY"]:
-             env_var = os.environ.get(env_var_name)
-             assert env_var, (
-                 f"Error while trying to run 'WMLInferenceEngine'. "
-                 f"Please set the env variable: '{env_var_name}', or "
-                 f"directly provide an instance of ibm-watsonx-ai 'Credentials' "
-                 f"to the engine."
            )

-             name = env_var_name.lower().replace("wml_", "")
-             credentials[name] = env_var

        return credentials

-     def _initialize_wml_client(self):
-         from ibm_watsonx_ai.client import APIClient
-
-         if self.credentials is None:
-             self.credentials = self._read_wml_credentials_from_env()

-         client = APIClient(credentials=self.credentials)
-         if "space_id" in self.credentials:
-             client.set.default_space(self.credentials["space_id"])
-         else:
-             client.set.default_project(self.credentials["project_id"])
-         return client

    def prepare_engine(self):
        self._client = self._initialize_wml_client()

        self._set_inference_parameters()

-     def _load_model_and_params(self):
-         from ibm_watsonx_ai.foundation_models import ModelInference

-         model = ModelInference(
            model_id=self.model_name,
            deployment_id=self.deployment_id,
            api_client=self._client,
        )
-         params = self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False)

-         return model, params

    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         self.verify_not_chat_api(dataset)
-         model, params = self._load_model_and_params()
-
-         result = []
-         for source in dataset["source"]:
-             instance_result = model.generate(
-                 prompt=source,
-                 params=self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False),
-             )
-             prediction = instance_result["results"][0]["generated_text"]
-             instance_final_results = self.get_return_object(
-                 prediction, instance_result, return_meta_data
-             )
-             result.append(instance_final_results)

-         return result

    def _infer_log_probs(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
-         self.verify_not_chat_api(dataset)
-
-         model, params = self._load_model_and_params()
-
-         user_return_options = params.pop("return_options", {})
-         # currently this is the only configuration that returns generated logprobs and behaves as expected
-         logprobs_return_options = {
-             "input_tokens": True,
-             "generated_tokens": True,
-             "token_logprobs": True,
-             "top_n_tokens": user_return_options.get("top_n_tokens", 5),
-         }
-         for key, value in logprobs_return_options.items():
-             if key in user_return_options and user_return_options[key] != value:
-                 raise ValueError(
-                     f"'{key}={user_return_options[key]}' is not supported for the 'infer_log_probs' "
-                     f"method of {self.__class__.__name__}. For obtaining the logprobs of generated tokens "
-                     f"please use '{key}={value}'."
-                 )
-
-         params = {
-             **params,
-             "return_options": logprobs_return_options,
-         }

-         results = model.generate(
-             prompt=[instance["source"] for instance in dataset],
-             params=params,
        )
-         final_results = []
-         for result in results:
-             generated_tokens = result["results"][0]["generated_tokens"]
-             final_results.append(
-                 self.get_return_object(generated_tokens, result, return_meta_data)
-             )
-         return final_results

-     def get_return_object(self, predict_result, result, return_meta_data):
-         if return_meta_data:
-             return TextGenerationInferenceOutput(
-                 prediction=predict_result,
-                 input_tokens=result["results"][0]["input_token_count"],
-                 output_tokens=result["results"][0]["generated_token_count"],
-                 model_name=self.model_name,
-                 inference_type=self.label,
-             )
-         return predict_result

    def get_token_count(self, dataset):
-         from ibm_watsonx_ai.foundation_models import ModelInference

        texts = [instance["source"] for instance in dataset]

-         model = ModelInference(
-             model_id=self.model_name,
-             deployment_id=self.deployment_id,
-             api_client=self._client,
-         )
-
        for i in trange(len(texts), desc="Tokenizing"):
-             response = model.tokenize(prompt=texts[i], return_tokens=True)["result"]
            dataset[i]["token_count"] = response["token_count"]

        return dataset

    def get_options_log_probs(self, dataset):
        """Add to each instance in the data a "options_log_prob" field, which is a dict with str as key and a list of {text: str, logprob:float}."""
-         from ibm_watsonx_ai.foundation_models import ModelInference
-
-         model = ModelInference(
-             model_id=self.model_name,
-             deployment_id=self.deployment_id,
-             api_client=self._client,
-         )

        texts = [x["source"] for x in dataset]

        responses = list(
            tqdm(
-                 model.generate(
                    prompt=texts,
                    params={
                        "decoding_method": "greedy",

@@ -1286,110 +2015,335 @@ class WMLInferenceEngine(
        return dataset


- def get_images_without_text(instance):
-     return extract_images(instance["source"], instance)


- def get_text_without_images(instance, image_token="<image>"):
-     regex = r"<" + f"{constants.image_tag}" + r'\s+src=["\'](.*?)["\']\s*/?>'
-     return re.sub(regex, image_token, instance["source"])


- class HFLlavaInferenceEngine(InferenceEngine, LazyLoadMixin):
-     model_name: str
-     max_new_tokens: int
-     lazy_load = True
-     image_token = "<image>"

-     _requirements_list = {
-         "transformers": "Install huggingface package using 'pip install --upgrade transformers",
-         "torch": "Install torch, go on PyTorch website for mode details.",
-         "accelerate": "pip install accelerate",
-     }

-     def get_engine_id(self):
-         return get_model_and_label_id(self.model_name, "hf_lava")

-     def _prepare_engine(self):
-         import torch
-         from transformers import AutoProcessor, LlavaForConditionalGeneration

-         self.device = torch.device(
-             "mps"
-             if torch.backends.mps.is_available()
-             else 0
-             if torch.cuda.is_available()
-             else "cpu"
        )

-         self.model = LlavaForConditionalGeneration.from_pretrained(
-             self.model_name,
-             torch_dtype=torch.float16,
-             low_cpu_mem_usage=True,
-         ).to(self.device)
-
-         self.processor = AutoProcessor.from_pretrained(self.model_name)
-
-     def prepare_engine(self):
-         if not self.lazy_load:
-             self._prepare_engine()

-     def _is_loaded(self):
-         return hasattr(self, "model") and self.model is not None

-     def _get_input(self, instance):
-         assert isinstance(instance["source"], list), "Must use format=formats.chat_api"
-         images = []
-         conversation = []
-         for turn in instance["source"]:
-             if isinstance(turn["content"], list):
-                 for content in turn["content"]:
-                     if content["type"] == "image_url":
-                         content["type"] = "image"
-                         image_url = content.pop("image_url")["url"]
-                         image = data_url_to_image(image_url)
-                         images.append(image)
-             conversation.append(turn)
-         return conversation, images

-     def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
-         return_meta_data: bool = False,
-     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         if not self._is_loaded():
-             self._prepare_engine()

-         import torch

-         results = []
-         for instance in tqdm(dataset):
-             conversation, images = self._get_input(instance)

-             if len(images) == 1:
-                 images = images[0]

-             text = self.processor.apply_chat_template(
-                 conversation, add_generation_prompt=True
-             )

-             inputs = self.processor(images=images, text=text, return_tensors="pt").to(
-                 self.device, torch.float16
            )

-             input_len = len(inputs["input_ids"][0])
-             output = self.model.generate(
-                 **inputs,
-                 max_new_tokens=self.max_new_tokens,
-                 do_sample=False,
-                 pad_token_id=self.processor.tokenizer.eos_token_id,
            )
-             result = self.processor.decode(
-                 output[0][input_len:], skip_special_tokens=True
            )
-             results.append(result)

-         return results


class LMMSEvalBaseInferenceEngine(

@@ -1400,7 +2354,9 @@ class LMMSEvalBaseInferenceEngine(
    batch_size: int = 1
    image_token = "<image>"

-     _requirements_list = ["lmms-eval==0.2.4"]

    def prepare_engine(self):
        if not self.lazy_load:

@@ -1447,7 +2403,6 @@ class LMMSEvalInferenceEngine(LMMSEvalBaseInferenceEngine):
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-         self.verify_not_chat_api(dataset)
        if not self._is_loaded():
            self._prepare_engine()

 import time
 import uuid
 from collections import Counter
+ from typing import (
+     Any,
+     Dict,
+     Iterable,
+     List,
+     Literal,
+     Mapping,
+     Optional,
+     Sequence,
+     Tuple,
+     Union,
+ )

 from datasets import DatasetDict
 from tqdm import tqdm, trange

 from .dataclass import InternalField, NonPositionalField
 from .deprecation_utils import deprecation
 from .error_utils import UnitxtError
+ from .image_operators import EncodeImageToString, data_url_to_image, extract_images
 from .logging_utils import get_logger
 from .operator import PackageRequirementsMixin
 from .operators import ArtifactFetcherMixin
 from .settings_utils import get_constants, get_settings
+ from .type_utils import isoftype

 constants = get_constants()
 settings = get_settings()

        input_tokens (int) : number of input tokens to the model.
        output_tokens (int) : number of output tokens to the model.
+         stop_reason (str): stop reason for text generation, for example "eos" (end of string).
+         seed (int): seed used by the model during generation.
+         input_text (str): input to the model.
        model_name (str): the model_name as kept in the InferenceEngine.
        inference_type (str): The label stating the type of the InferenceEngine.
    """

    prediction: Union[str, List[Dict[str, Any]]]
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
+     stop_reason: Optional[str] = None
+     seed: Optional[int] = None
+     input_text: Optional[str] = None
    model_name: Optional[str] = None
    inference_type: Optional[str] = None
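
Not part of the commit: to make the newly added metadata fields concrete, a hedged sketch of constructing the dataclass with them; the values are invented for illustration.

# Illustrative values only; the field names come from the hunk above.
output = TextGenerationInferenceOutput(
    prediction="The answer is 42.",
    input_tokens=17,
    output_tokens=6,
    stop_reason="eos",                      # new field: why generation stopped
    seed=123,                               # new field: seed reported by the backend, if any
    input_text="What is six times seven?",  # new field: the prompt that was sent
    model_name="some-model",
    inference_type="hf_auto_model",
)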


            if param_inst_val is None:
                setattr(self, param, param_dict_val)

+     def get_model_details(self) -> Dict:
+         """Might not be possible to implement for all inference engines. Returns an empty dict by default."""
+         return {}
+
    def verify_not_chat_api(self, dataset):
        if isinstance(dataset[0]["source"], list):
            raise NotImplementedError(
 
        pass


+ class HFGenerationParamsMixin(Artifact):
    max_new_tokens: int
+     do_sample: bool = False
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
    top_k: Optional[int] = None
+     num_beams: Optional[int] = None
+     repetition_penalty: Optional[float] = None
+     pad_token_id: Optional[int] = None
+     eos_token_id: Optional[int] = None
+
+
+ class HFInferenceEngineBase(
+     InferenceEngine,
+     LogProbInferenceEngine,
+     PackageRequirementsMixin,
+     LazyLoadMixin,
+     HFGenerationParamsMixin,
+ ):
+     model_name: str
+     label: str
+
+     n_top_tokens: int = 5
+
+     device: Any = None
+     device_map: Any = None
+
+     use_fast_tokenizer: bool = True
+     low_cpu_mem_usage: bool = True
+     torch_dtype: str = "torch.float16"
+
+     model: Any = InternalField(default=None, name="Inference object")
+     processor: Any = InternalField(default=None, name="Input processor (tokenizer)")

    _requirements_list = {
+         "transformers": "Install huggingface package using 'pip install --upgrade transformers",
+         "torch": "Install torch, go on PyTorch website for mode details.",
+         "accelerate": "pip install accelerate",
    }

+     def _is_loaded(self):
+         return hasattr(self, "model") and self.model is not None

+     def _set_inference_device(self):
+         if self.device is not None and self.device_map is not None:
+             raise ValueError(
+                 f"You must specify either 'device' or 'device_map', however both "
+                 f"were given: 'device={self.device}', 'device_map={self.device_map}'."
+             )

+         if self.device is None and self.device_map is None:
+             import torch

+             self.device = torch.device(
+                 "mps"
+                 if torch.backends.mps.is_available()
+                 else 0
+                 if torch.cuda.is_available()
+                 else "cpu"
+             )

+     @abc.abstractmethod
+     def _init_processor(self):
+         raise NotImplementedError

+     @abc.abstractmethod
+     def _init_model(self):
+         raise NotImplementedError
+
+     def _get_torch_dtype(self):
+         import torch
+
+         if not isinstance(self.torch_dtype, str) or not self.torch_dtype.startswith(
+             "torch."
+         ):
+             raise ValueError(
+                 f"'torch_dtype' must be a string representing torch data "
+                 f"type used for inference. The name should be an absolute "
+                 f"import, for example: 'torch.float16'. However, "
+                 f"'{self.torch_dtype}' was given instead."
+             )

+         try:
+             dtype = eval(self.torch_dtype)
+         except (AttributeError, TypeError) as e:
+             raise ValueError(
+                 f"Incorrect value of 'torch_dtype' was given: '{self.torch_dtype}'."
+             ) from e
+
+         if not isinstance(dtype, torch.dtype):
+             raise ValueError(
+                 f"'torch_dtype' must be an instance of 'torch.dtype', however, "
+                 f"'{dtype}' is an instance of '{type(dtype)}'."
+             )

+         return dtype

+     def _prepare_engine(self):
+         self._set_inference_device()
+         self._init_processor()
+         self._init_model()

    def prepare_engine(self):
        if not self.lazy_load:
+             self._prepare_engine()

+     def get_engine_id(self):
+         return get_model_and_label_id(self.model_name, self.label)

+     def decode_tokens(self, tokens: Sequence, inp_length: int) -> List[str]:
+         return [
+             self.processor.decode(token, skip_special_tokens=True)
+             for token in tokens[inp_length:]
+         ]

+     @staticmethod
+     def create_string_from_tokens(string_tokens: List[str]) -> str:
+         return "".join(token for token in string_tokens)
+
+     def make_predictions(self, prepared_inputs: Mapping) -> Mapping:
+         return self.model.generate(
+             **prepared_inputs,
+             **self.to_dict([HFGenerationParamsMixin], keep_empty=False),
+             output_scores=True,
+             return_dict_in_generate=True,
+         )

+     def compute_transition_scores(
+         self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int]
+     ) -> Sequence:
+         # Some models may not support computing scores in this form by default, so a possible
+         # child class should have its own implementation of this method if necessary.
+         return self.model.compute_transition_scores(
+             sequences,
+             scores,
+             normalize_logits=True,
+             beam_indices=beam_indices,
+         )

+     def get_logprobs(
+         self, predictions: Mapping, string_tokens: List[List[str]]
+     ) -> List[List[Dict[str, Any]]]:
+         beam_indices = (
+             predictions.beam_indices
+             if self.num_beams is not None and self.num_beams > 1
+             else None
+         )

+         transition_scores = self.compute_transition_scores(
+             sequences=predictions.sequences,
+             scores=predictions.scores,
+             beam_indices=beam_indices,
+         )

+         logprobs: List[List[Dict[str, Any]]] = []

+         for sample_no, sample_scores in enumerate(transition_scores.detach().cpu()):
+             sample_logprobs: List[Dict[str, Any]] = []

+             for n, score in enumerate(sample_scores):
+                 sample_logprobs.append(
+                     {
+                         "text": string_tokens[sample_no][n],
+                         "logprob": float(score.cpu()),
+                         "top_tokens": [
+                             {
+                                 "text": self.processor.decode(idx),
+                                 "logprob": float(
+                                     predictions.scores[n][sample_no][idx].cpu()
+                                 ),
+                             }
+                             for idx in predictions.scores[n][sample_no].argsort(
+                                 dim=0, descending=True
+                             )[: self.n_top_tokens]
+                         ],
+                     }
+                 )
+
+             logprobs.append(sample_logprobs)
+
+         return logprobs
+
+     @abc.abstractmethod
+     def prepare_inputs(self, data: Iterable) -> Mapping:
+         raise NotImplementedError
+
+     def get_return_object(
+         self,
+         output: Union[str, List[Dict[str, Any]]],
+         output_tokens: Optional[int],
+         inp: Optional[str],
+         inp_tokens: Optional[int],
+         return_meta_data: bool,
+     ) -> Union[str, List[Dict[str, Any]], TextGenerationInferenceOutput]:
+         if return_meta_data:
+             return TextGenerationInferenceOutput(
+                 prediction=output,
+                 output_tokens=output_tokens if output_tokens is not None else None,
+                 input_text=inp,
+                 input_tokens=inp_tokens if inp_tokens is not None else None,
+                 model_name=self.model_name,
+                 inference_type=self.label,
+             )
+         return output
+
+     def infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
+         return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+         if not self._is_loaded():
+             self._prepare_engine()
+         return super().infer(dataset, return_meta_data)

+     @abc.abstractmethod
    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+         raise NotImplementedError

+     def infer_log_probs(
+         self,
+         dataset: Union[List[Dict[str, Any]], DatasetDict],
+         return_meta_data: bool = False,
+     ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+         if not self._is_loaded():
+             self._prepare_engine()
+         return super().infer_log_probs(dataset, return_meta_data)

+     @abc.abstractmethod
+     def _infer_log_probs(
+         self,
+         dataset: Union[List[Dict[str, Any]], DatasetDict],
+         return_meta_data: bool = False,
+     ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+         raise NotImplementedError


+ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
+     label: str = "hf_auto_model"

+     def _init_processor(self):
+         from transformers import AutoTokenizer

+         self.processor = AutoTokenizer.from_pretrained(
+             pretrained_model_name_or_path=self.model_name,
+             use_fast=self.use_fast_tokenizer,
+             padding=True,
+             truncation=True,
+         )

+     def _init_model(self):
+         from transformers import (
+             AutoConfig,
+             AutoModelForCausalLM,
+             AutoModelForSeq2SeqLM,
+         )

+         model_class = (
+             AutoModelForSeq2SeqLM
+             if AutoConfig.from_pretrained(self.model_name).is_encoder_decoder
+             else AutoModelForCausalLM
+         )

+         self.model = model_class.from_pretrained(
+             pretrained_model_name_or_path=self.model_name,
+             trust_remote_code=True,
+             device_map=self.device_map,
+             torch_dtype=self._get_torch_dtype(),
+         )
+         if self.device_map is None:
+             self.model.to(self.device)
+
+     def prepare_inputs(self, data: Iterable) -> Mapping:
+         return self.processor(
+             data,
+             padding=True,
+             truncation=True,
+             return_tensors="pt",
+         ).to(self.device or self.device_map)
+
+     def _infer_fn(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
+         return_meta_data: bool,
+         return_logprobs: bool,
+     ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
+         tokenized_inputs = self.prepare_inputs(
+             [instance["source"] for instance in dataset]
+         )
+         input_length = (
+             1
+             if self.model.config.is_encoder_decoder
+             else tokenized_inputs.input_ids.shape[1]
+         )

+         predictions = self.make_predictions(tokenized_inputs)
+         sequences = predictions.sequences

+         string_tokens = [
+             self.decode_tokens(sequence, input_length) for sequence in sequences
+         ]

+         final_outputs = (
+             self.get_logprobs(predictions, string_tokens)
+             if return_logprobs
+             else [self.create_string_from_tokens(strings) for strings in string_tokens]
+         )

+         return [
+             self.get_return_object(
+                 output=final_outputs[i],
+                 output_tokens=len(string_tokens[i]),
+                 inp=dataset[i]["source"],
+                 inp_tokens=len(tokenized_inputs.encodings[i].tokens)
+                 if tokenized_inputs.encodings is not None
+                 else None,
+                 return_meta_data=return_meta_data,
+             )
+             for i in range(len(sequences))
+         ]

    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], DatasetDict],
        return_meta_data: bool = False,
    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+         self.verify_not_chat_api(dataset)
+         return self._infer_fn(dataset, return_meta_data, False)

+     def _infer_log_probs(
+         self,
+         dataset: Union[List[Dict[str, Any]], DatasetDict],
+         return_meta_data: bool = False,
+     ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+         self.verify_not_chat_api(dataset)
+         return self._infer_fn(dataset, return_meta_data, True)
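
Not part of the commit: a hedged sketch of how the new HFAutoModelInferenceEngine might be called. The import path, the model name, and the dataset shape (a list of dicts with a "source" string) are assumptions based on the code above.

# Hedged usage sketch; the unitxt.inference import path and the model id are assumed.
from unitxt.inference import HFAutoModelInferenceEngine

engine = HFAutoModelInferenceEngine(
    model_name="google/flan-t5-small",  # hypothetical model choice
    max_new_tokens=32,                  # generation parameter required by HFGenerationParamsMixin
)
dataset = [{"source": "Translate to French: good morning"}]
texts = engine.infer(dataset)                             # list of generated strings
detailed = engine.infer(dataset, return_meta_data=True)   # TextGenerationInferenceOutput objects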
580
 
581
 
582
+ class HFLlavaInferenceEngine(HFInferenceEngineBase):
583
+ lazy_load: bool = True
584
+ label: str = "hf_lava"
585
+ image_token: str = "<image>"
586
 
587
+ def compute_transition_scores(
588
+ self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int]
589
+ ) -> Sequence:
590
+ if not hasattr(self.model.config, "vocab_size"):
591
+ self.model.config.vocab_size = self.model.vocab_size
592
 
593
+ return super().compute_transition_scores(sequences, scores, beam_indices)
 
 
594
 
595
+ def _init_processor(self):
596
+ from transformers import AutoProcessor
597
 
598
+ self.processor = AutoProcessor.from_pretrained(self.model_name)
 
 
599
 
600
+ if not self.pad_token_id and hasattr(self.processor, "eos_token_id"):
601
+ self.pad_token_id = self.processor.eos_token_id
 
602
 
603
+ def _init_model(self):
604
+ from transformers import LlavaForConditionalGeneration
605
 
606
+ self.model = LlavaForConditionalGeneration.from_pretrained(
607
+ self.model_name,
608
+ torch_dtype=self._get_torch_dtype(),
609
+ low_cpu_mem_usage=self.low_cpu_mem_usage,
610
+ device_map=self.device_map,
611
+ )
612
+ if self.device_map is None:
613
+ self.model.to(self.device)
614
 
615
+ @staticmethod
616
+ def _get_input(instance):
617
+ assert isinstance(instance["source"], list), "Must use format=formats.chat_api"
618
+ images = []
619
+ conversation = []
620
+ for turn in instance["source"]:
621
+ if isinstance(turn["content"], list):
622
+ for content in turn["content"]:
623
+ if content["type"] == "image_url":
624
+ content["type"] = "image"
625
+ image_url = content.pop("image_url")["url"]
626
+ image = data_url_to_image(image_url)
627
+ images.append(image)
628
+ conversation.append(turn)
629
+ return conversation, images
630
+
631
+ def prepare_inputs(self, data: Iterable) -> Mapping:
632
+ conversation, images = self._get_input(data)
633
+
634
+ if len(images) == 1:
635
+ images = images[0]
636
+
637
+ text = self.processor.apply_chat_template(
638
+ conversation, add_generation_prompt=True
639
+ )
640
+
641
+ inputs: Mapping = self.processor(
642
+ images=images, text=text, return_tensors="pt"
643
+ ).to(self.device or self.device_map, self._get_torch_dtype())
644
+
645
+ return inputs
646
+
647
+ def _infer_fn(
648
+ self,
649
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
650
+ return_meta_data: bool,
651
+ return_logprobs: bool,
652
+ ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
653
+ results = []
654
+
655
+ for instance in tqdm(dataset):
656
+ processed_inputs = self.prepare_inputs(instance)
657
+ input_len = len(processed_inputs["input_ids"][0])
658
+
659
+ predictions = self.make_predictions(processed_inputs)
660
+
661
+ string_tokens = self.decode_tokens(predictions.sequences[0], input_len)
662
+
663
+ final_outputs = (
664
+ self.get_logprobs(predictions, [string_tokens])[0]
665
+ if return_logprobs
666
+ else self.create_string_from_tokens(string_tokens)
667
+ )
668
+
669
+ results.append(
670
+ self.get_return_object(
671
+ output=final_outputs,
672
+ output_tokens=len(string_tokens),
673
+ inp=instance["source"],
674
+ inp_tokens=None,
675
+ return_meta_data=return_meta_data,
676
+ )
677
+ )
678
+
679
+ return results
680
+
681
+ def _infer(
682
+ self,
683
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
684
+ return_meta_data: bool = False,
685
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
686
+ return self._infer_fn(dataset, return_meta_data, False)
687
+
688
+ def _infer_log_probs(
689
+ self,
690
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
691
+ return_meta_data: bool = False,
692
+ ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
693
+ return self._infer_fn(dataset, return_meta_data, True)
694
+
695
+
696
+ class HFPeftInferenceEngine(HFAutoModelInferenceEngine):
697
+ label: str = "hf_peft_auto_model"
698
+
699
+ peft_config: Any = InternalField(
700
+ default=None,
701
+ name="PEFT config read from the directory or the Hub repository "
702
+ "id specified in the 'model_name'.",
703
+ )
704
+
705
+ _requirements_list = {
706
+ "transformers": "Install huggingface package using 'pip install --upgrade transformers",
707
+ "torch": "Install torch, go on PyTorch website for mode details.",
708
+ "accelerate": "pip install accelerate",
709
+ "peft": "Install 'peft' package using: 'pip install peft'.",
710
+ }
711
+
712
+ def _prepare_engine(self):
713
+ self._read_peft_config()
714
+ super()._prepare_engine()
715
+
716
+ def _read_peft_config(self):
717
+ from peft import PeftConfig
718
+
719
+ try:
720
+ config = PeftConfig.from_pretrained(self.model_name)
721
+ assert isinstance(config.base_model_name_or_path, str)
722
+ self.peft_config = config
723
+
724
+ except ValueError as e:
725
+ if "Can't find" in str(e):
726
+ raise ValueError(
727
+ f"Specified model '{self.model_name}' is not the PEFT model. "
728
+ f"Use a regular instance of the `HFAutoModelInferenceEngine` "
729
+ f"instead."
730
+ ) from e
731
+
732
+ raise e
733
+
734
+ def _init_processor(self):
735
+ from transformers import AutoTokenizer
736
+
737
+ self.processor = AutoTokenizer.from_pretrained(
738
+ self.peft_config.base_model_name_or_path
739
+ )
740
+
741
+ def _init_model(self):
742
+ from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
743
+ from transformers import AutoConfig
744
+
745
+ model_class = (
746
+ AutoPeftModelForSeq2SeqLM
747
+ if AutoConfig.from_pretrained(self.model_name).is_encoder_decoder
748
+ else AutoPeftModelForCausalLM
749
+ )
750
+
751
+ self.model = model_class.from_pretrained(
752
+ pretrained_model_name_or_path=self.peft_config.base_model_name_or_path,
753
+ trust_remote_code=True,
754
+ device_map=self.device_map,
755
+ low_cpu_mem_usage=self.low_cpu_mem_usage,
756
+ torch_dtype=self._get_torch_dtype(),
757
+ )
758
+ if self.device_map is None:
759
+ self.model.to(self.device)
760
+
761
+
762
+ @deprecation(
763
+ version="2.0.0", msg=" Use non-pipeline-based 'HFInferenceEngine' instead."
764
+ )
765
+ class HFPipelineBasedInferenceEngine(
766
+ InferenceEngine, PackageRequirementsMixin, LazyLoadMixin, HFGenerationParamsMixin
767
+ ):
768
+ model_name: str
769
+ label: str = "hf_pipeline_inference_engine"
770
+
771
+ use_fast_tokenizer: bool = True
772
+ use_fp16: bool = True
773
+ load_in_8bit: bool = False
774
+
775
+ task: Optional[str] = None
776
+
777
+ device: Any = None
778
+ device_map: Any = None
779
+
780
+ pipe: Any = InternalField(default=None)
781
+
782
+ _requirements_list = {
783
+ "transformers": "Install huggingface package using 'pip install --upgrade transformers",
784
+ "torch": "Install torch, go on PyTorch website for mode details.",
785
+ "accelerate": "pip install accelerate",
786
+ }
787
+
788
+ def _is_loaded(self):
789
+ return hasattr(self, "model") and self.model is not None
790
+
791
+ def get_engine_id(self):
792
+ return get_model_and_label_id(self.model_name, "hf_pipeline")
793
+
794
+ def _define_task(self):
795
+ from transformers import AutoConfig
796
+
797
+ self.task = (
798
+ "text2text-generation"
799
+ if AutoConfig.from_pretrained(
800
+ self.model_name, trust_remote_code=True
801
+ ).is_encoder_decoder
802
+ else "text-generation"
803
+ )
804
+
805
+ def _get_model_args(self) -> Dict[str, Any]:
806
+ import torch
807
+ from transformers import BitsAndBytesConfig
808
+
809
+ args = {}
810
+
811
+ if self.load_in_8bit:
812
+ quantization_config = BitsAndBytesConfig(load_in_8bit=self.load_in_8bit)
813
+ args["quantization_config"] = quantization_config
814
+ elif self.use_fp16:
815
+ if self.device == torch.device("mps"):
816
+ args["torch_dtype"] = torch.float16
817
+ else:
818
+ args["torch_dtype"] = torch.bfloat16
819
+
820
+ # We do this, because in some cases, using device:auto will offload some weights to the cpu
821
+ # (even though the model might *just* fit to a single gpu), even if there is a gpu available, and this will
822
+ # cause an error because the data is always on the gpu
823
+ if torch.cuda.device_count() > 1:
824
+ assert self.device == torch.device(0)
825
+ args["device_map"] = "auto"
826
+ else:
827
+ if not self.load_in_8bit:
828
+ args["device"] = self.device
829
+
830
+ if self.task == "text-generation":
831
+ args["return_full_text"] = False
832
+
833
+ return args
834
+
835
+ def _create_pipeline(self, model_args: Dict[str, Any]):
836
+ from transformers import pipeline
837
+
838
+ self.model = pipeline(
839
+ model=self.model_name,
840
+ task=self.task,
841
+ use_fast=self.use_fast_tokenizer,
842
+ trust_remote_code=True,
843
+ **model_args,
844
+ **self.to_dict(
845
+ [HFGenerationParamsMixin],
846
+ keep_empty=False,
847
+ ),
848
+ )
849
+
850
+ def _set_inference_device(self):
851
+ if self.device is not None and self.device_map is not None:
852
+ raise ValueError(
853
+ f"You must specify either 'device' or 'device_map', however both "
854
+ f"were given: 'device={self.device}', 'device_map={self.device_map}'."
855
+ )
856
+
857
+ if self.device is None and self.device_map is None:
858
+ import torch
859
+
860
+ self.device = torch.device(
861
+ "mps"
862
+ if torch.backends.mps.is_available()
863
+ else 0
864
+ if torch.cuda.is_available()
865
+ else "cpu"
866
+ )
867
+
868
+ def _prepare_engine(self):
869
+ self._set_inference_device()
870
+ if self.task is None:
871
+ self._define_task()
872
+ model_args = self._get_model_args()
873
+ self._create_pipeline(model_args)
874
+
875
+ def prepare_engine(self):
876
+ if not self.lazy_load:
877
+ self._prepare_engine()
878
+
879
+ def _infer(
880
+ self,
881
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
882
+ return_meta_data: bool = False,
883
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
884
+ if not self._is_loaded():
885
+ self._prepare_engine()
886
+
887
+ outputs = self.model([instance["source"] for instance in dataset])
888
+
889
+ return [
890
+ self.get_return_object(output[0], instance["source"], return_meta_data)
891
+ if isinstance(output, list)
892
+ else self.get_return_object(output, instance["source"], return_meta_data)
893
+ for output, instance in zip(outputs, dataset)
894
+ ]
895
+
896
+ def get_return_object(self, output, inp, return_meta_data):
897
+ if return_meta_data:
898
+ return TextGenerationInferenceOutput(
899
+ prediction=output["generated_text"],
900
+ model_name=self.model_name,
901
+ inference_type=self.label,
902
+ input_text=inp,
903
+ )
904
+ return output["generated_text"]
905
+
906
+
907
+ def mock_logprobs_default_value_factory() -> List[Dict[str, Any]]:
908
+ return [
909
+ {
910
+ "logprob": -1,
911
+ "text": "[[10]]",
912
+ "top_tokens": [
913
+ {"logprob": -1, "text": "[[10]]"},
914
+ ],
915
+ }
916
+ ]
917
+
918
+
919
+ class MockInferenceEngine(InferenceEngine, LogProbInferenceEngine):
920
+ model_name: str
921
+ default_inference_value: str = "[[10]]"
922
+ default_inference_value_logprob: List[Dict[str, Any]] = dataclasses.field(
923
+ default_factory=mock_logprobs_default_value_factory,
924
+ )
925
+ label: str = "mock_inference_engine"
926
+
927
+ def get_engine_id(self):
928
+ return get_model_and_label_id(self.model_name, "mock")
929
+
930
+ def prepare_engine(self):
931
+ return
932
+
933
+ def _mock_infer(
934
+ self,
935
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
936
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
937
+ return [self.default_inference_value for _ in dataset]
938
+
939
+ def _infer(
940
+ self,
941
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
942
+ return_meta_data: bool = False,
943
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
944
+ return [
945
+ self.get_return_object(
946
+ self.default_inference_value, instance, return_meta_data
947
+ )
948
+ for instance in dataset
949
+ ]
950
+
951
+ def _infer_log_probs(
952
+ self,
953
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
954
+ return_meta_data: bool = False,
955
+ ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
956
+ return [
957
+ self.get_return_object(
958
+ self.default_inference_value_logprob, instance, return_meta_data
959
+ )
960
+ for instance in dataset
961
+ ]
962
+
963
+ def get_return_object(self, predict_result, instance, return_meta_data):
964
+ if return_meta_data:
965
+ return TextGenerationInferenceOutput(
966
+ prediction=predict_result,
967
+ input_tokens=len(instance["source"]),
968
+ output_tokens=len(predict_result),
969
+ model_name=self.model_name,
970
+ inference_type=self.label,
971
+ input_text=instance["source"],
972
+ seed=111,
973
+ stop_reason="",
974
+ )
975
+ return predict_result
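MockInferenceEngine above is handy for unit tests and dry runs: it never contacts a model and simply returns its configured defaults. A hedged sketch of how it might be exercised (the model name is arbitrary):

    # Sketch only; "[[10]]" is the class's default inference value.
    mock_engine = MockInferenceEngine(model_name="mock-model")
    outputs = mock_engine.infer([{"source": "any prompt"}])
    # outputs == ["[[10]]"]
    detailed = mock_engine.infer([{"source": "any prompt"}], return_meta_data=True)
    # detailed[0].prediction == "[[10]]", with token counts derived from input/output lengths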
976
+
977
+
978
+ class MockModeMixin(Artifact):
979
+ mock_mode: bool = False
980
+
981
+
982
+ class IbmGenAiInferenceEngineParamsMixin(Artifact):
983
+ beam_width: Optional[int] = None
984
+ decoding_method: Optional[Literal["greedy", "sample"]] = None
985
+ include_stop_sequence: Optional[bool] = None
986
+ length_penalty: Any = None
987
+ max_new_tokens: Optional[int] = None
988
+ min_new_tokens: Optional[int] = None
989
+ random_seed: Optional[int] = None
990
+ repetition_penalty: Optional[float] = None
991
+ return_options: Any = None
992
+ stop_sequences: Optional[List[str]] = None
993
+ temperature: Optional[float] = None
994
+ time_limit: Optional[int] = None
995
+ top_k: Optional[int] = None
996
+ top_p: Optional[float] = None
997
+ truncate_input_tokens: Optional[int] = None
998
+ typical_p: Optional[float] = None
999
+
1000
+
1001
+ @deprecation(version="2.0.0", alternative=IbmGenAiInferenceEngineParamsMixin)
1002
+ class IbmGenAiInferenceEngineParams(Artifact):
1003
+ beam_width: Optional[int] = None
1004
+ decoding_method: Optional[Literal["greedy", "sample"]] = None
1005
+ include_stop_sequence: Optional[bool] = None
1006
+ length_penalty: Any = None
1007
+ max_new_tokens: Optional[int] = None
1008
+ min_new_tokens: Optional[int] = None
1009
+ random_seed: Optional[int] = None
1010
+ repetition_penalty: Optional[float] = None
1011
+ return_options: Any = None
1012
+ stop_sequences: Optional[List[str]] = None
1013
+ temperature: Optional[float] = None
1014
+ time_limit: Optional[int] = None
1015
+ top_k: Optional[int] = None
1016
+ top_p: Optional[float] = None
1017
+ truncate_input_tokens: Optional[int] = None
1018
+ typical_p: Optional[float] = None
1019
+
1020
+
1021
+ class GenericInferenceEngine(
1022
+ InferenceEngine, ArtifactFetcherMixin, LogProbInferenceEngine
1023
+ ):
1024
+ default: Optional[str] = None
1025
+
1026
+ def prepare_engine(self):
1027
+ if "UNITXT_INFERENCE_ENGINE" in os.environ:
1028
+ engine_reference = os.environ["UNITXT_INFERENCE_ENGINE"]
1029
+ else:
1030
+ assert self.default is not None, (
1031
+ "GenericInferenceEngine could not be initialized"
1032
+ '\nThis is because the "UNITXT_INFERENCE_ENGINE" environment variable is not set and no default engine was provided.'
1033
+ "\nFor example, you can fix it by setting"
1034
+ "\nexport UNITXT_INFERENCE_ENGINE=engines.ibm_gen_ai.llama_3_70b_instruct"
1035
+ "\nto your ~/.bashrc"
1036
+ "\nor passing a similar required engine in the default argument"
1037
+ )
1038
+ engine_reference = self.default
1039
+ self.engine = self.get_artifact(engine_reference)
1040
+
1041
+ def get_engine_id(self):
1042
+ # If mock_inference_mode is set, no engine is prepared.
1043
+ if hasattr(self, "engine"):
1044
+ return f"generic_{self.engine.get_engine_id()}"
1045
+ return "generic_inference_engine"
1046
+
1047
+ def _infer(
1048
+ self,
1049
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
1050
+ return_meta_data: bool = False,
1051
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
1052
+ return self.engine._infer(dataset)
1053
+
1054
+ def _infer_log_probs(
1055
+ self,
1056
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
1057
+ return_meta_data: bool = False,
1058
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
1059
+ if not isinstance(self.engine, LogProbInferenceEngine):
1060
+ raise NotImplementedError(
1061
+ f"Error in infer: inference engine used by the GenericInferenceEngine"
1062
+ f"({self.engine.__class__.__name__}) does not support logprobs."
1063
+ )
1064
+ return self.engine._infer_log_probs(dataset)
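GenericInferenceEngine above defers the choice of backend to the UNITXT_INFERENCE_ENGINE environment variable, falling back to the 'default' artifact reference. A hedged sketch (the catalog reference is the one quoted in the error message above):

    import os

    # Either export the variable before constructing the engine ...
    os.environ["UNITXT_INFERENCE_ENGINE"] = "engines.ibm_gen_ai.llama_3_70b_instruct"
    engine = GenericInferenceEngine()

    # ... or supply an explicit fallback used only when the variable is unset.
    engine = GenericInferenceEngine(default="engines.ibm_gen_ai.llama_3_70b_instruct")
    predictions = engine.infer(dataset)  # dataset: list of {"source": ...} instances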
1065
+
1066
+
1067
+ class OllamaInferenceEngine(
1068
+ InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin
1069
+ ):
1070
+ label: str = "ollama"
1071
+ _requirements_list = {
1072
+ "ollama": "Install ollama package using 'pip install --upgrade ollama"
1073
+ }
1074
+ data_classification_policy = ["public", "proprietary"]
1075
+
1076
+ def get_engine_id(self):
1077
+ return get_model_and_label_id(self.model, self.label)
1078
+
1079
+ def prepare_engine(self):
1080
+ pass
1081
+
1082
+ def _infer(
1083
+ self,
1084
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
1085
+ return_meta_data: bool = False,
1086
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
1087
+ import ollama
1088
+
1089
+ args = self.to_dict([StandardAPIParamsMixin])
1090
+
1091
+ results = []
1092
+
1093
+ for instance in dataset:
1094
+ messages = self.to_messages(instance)
1095
+ response = ollama.chat(
1096
+ model=self.model,
1097
+ messages=messages,
1098
+ **args,
1099
+ )
1100
+ results.append(response)
1101
+
1102
+ return [element["message"]["content"] for element in results]
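OllamaInferenceEngine above converts each instance into chat messages and sends them to a locally running Ollama server. A minimal hedged sketch (assumes the ollama package is installed, a server is running, and the named model has been pulled; the model name is illustrative):

    engine = OllamaInferenceEngine(model="llama3.1")
    answers = engine.infer([{"source": "What is the capital of France?"}])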
1103
+
1104
+
1105
+ class OptionSelectingByLogProbsInferenceEngine:
1106
+ """OptionSelectingByLogProbsInferenceEngine inference engine is used to select an option based on the logprobs of an options list conditioned by a prompt.
1107
+
1108
+ The inference engines that inherit from this class must implement `get_token_count` and `get_options_log_probs`.
1109
+ """
1110
+
1111
+ @abc.abstractmethod
1112
+ def get_token_count(self, dataset):
1113
+ """Get the token count of the source key of each dict of the dataset. Add to each instance in the data a "token_count" field.
1114
+
1115
+ Args:
1116
+ dataset (List[Dict[str, Any]]): A list of dictionaries, each representing a data instance.
1117
+
1118
+ Returns:
1119
+ List[Dict[str, Any]]: The dataset, with a "token_count" field added to each instance.
1120
+ """
1121
+
1122
+ @abc.abstractmethod
1123
+ def get_options_log_probs(self, dataset):
1124
+ """Get the token logprobs of the options of the key task_data.options of each dict of the dataset.
1125
+
1126
+ Add to each instance in the data a "options_log_prob" field, which is a dict with str as key and a list of {text: str, logprob:float}.
1127
+
1128
+ Args:
1129
+ dataset (List[Dict[str, Any]]): A list of dictionaries, each representing a data instance.
1130
+
1131
+ Returns:
1132
+ List[Dict[str, Any]]: The dataset, with an "options_log_prob" field added to each instance.
1133
  """
1134
 
1135
  def select(self, dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
1213
  }
1214
  data_classification_policy = ["public", "proprietary"]
1215
  parameters: Optional[IbmGenAiInferenceEngineParams] = None
1216
+ rate_limit: int = 10
1217
 
1218
  def get_engine_id(self):
1219
  return get_model_and_label_id(self.model_name, self.label)
1220
 
1221
+ @staticmethod
1222
+ def _get_credentials():
1223
+ from genai import Credentials
1224
 
1225
  api_key_env_var_name = "GENAI_KEY"
1226
  api_key = os.environ.get(api_key_env_var_name)
 
1229
  f"Error while trying to run IbmGenAiInferenceEngine."
1230
  f" Please set the environment param '{api_key_env_var_name}'."
1231
  )
1232
+
1233
+ return Credentials(api_key=api_key)
1234
+
1235
+ def prepare_engine(self):
1236
+ self.check_missing_requirements()
1237
+
1238
+ from genai import Client
1239
+ from genai.text.generation import CreateExecutionOptions
1240
+
1241
+ credentials = self._get_credentials()
1242
  self.client = Client(credentials=credentials)
1243
 
1244
+ self.execution_options = CreateExecutionOptions(
1245
+ concurrency_limit=self.rate_limit
1246
+ )
1247
+
1248
  self._set_inference_parameters()
1249
 
1250
  def _infer(
 
1252
  dataset: Union[List[Dict[str, Any]], DatasetDict],
1253
  return_meta_data: bool = False,
1254
  ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
1255
+ from genai.schema import TextGenerationParameters, TextGenerationResult
1256
+
1257
+ self.verify_not_chat_api(dataset)
1258
 
1259
  genai_params = TextGenerationParameters(
1260
  **self.to_dict([IbmGenAiInferenceEngineParamsMixin])
1261
  )
1262
 
 
1263
  responses = self.client.text.generation.create(
1264
  model_id=self.model_name,
1265
  inputs=[instance["source"] for instance in dataset],
1266
  parameters=genai_params,
1267
+ execution_options=self.execution_options,
1268
  )
1269
+
1270
+ results = []
1271
  for response in responses:
1272
+ generation_result: TextGenerationResult = response.results[0]
1273
  result = self.get_return_object(
1274
+ generation_result.generated_text, generation_result, return_meta_data
1275
  )
1276
  results.append(result)
1277
  return results
 
1281
  dataset: Union[List[Dict[str, Any]], DatasetDict],
1282
  return_meta_data: bool = False,
1283
  ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
1284
+ from genai.schema import TextGenerationParameters, TextGenerationResult
1285
+
1286
+ self.verify_not_chat_api(dataset)
1287
 
1288
  logprobs_return_options = {
1289
  "generated_tokens": True,
 
1302
  model_id=self.model_name,
1303
  inputs=[instance["source"] for instance in dataset],
1304
  parameters=genai_params,
1305
+ execution_options=self.execution_options,
1306
  )
1307
 
1308
  predict_results = []
1309
  for prediction in predictions:
1310
+ result: TextGenerationResult = prediction.results[0]
1311
  assert isinstance(
1312
  result.generated_tokens, list
1313
  ), "result.generated_tokens should be a list"
 
1334
  output_tokens=result.generated_token_count,
1335
  model_name=self.model_name,
1336
  inference_type=self.label,
1337
+ input_text=result.input_text,
1338
+ seed=self.random_seed,
1339
+ stop_reason=result.stop_reason,
1340
  )
1341
  return predict_result
1342
 
1343
+ def get_model_details(self) -> Dict:
1344
+ from genai import ApiClient
1345
+ from genai.model import ModelService
1346
+
1347
+ api_client = ApiClient(credentials=self._get_credentials())
1348
+ model_info = (
1349
+ ModelService(api_client=api_client).retrieve(id=self.model_name).result
1350
+ )
1351
+ return model_info.dict()
1352
+
1353
  def get_token_count(self, dataset):
1354
  texts = [instance["source"] for instance in dataset]
1355
  token_counts = list(
 
1669
  return OpenAI(api_key=api_key, base_url=api_url)
1670
 
1671
 
1672
+ @deprecation(
1673
+ version="2.0.0",
1674
+ msg=" You can specify inference parameters directly when initializing an inference engine.",
1675
+ )
1676
  class WMLInferenceEngineParamsMixin(Artifact):
1677
  decoding_method: Optional[Literal["greedy", "sample"]] = None
1678
  length_penalty: Optional[Dict[str, Union[int, float]]] = None
 
1708
  return_options: Optional[Dict[str, bool]] = None
1709
 
1710
 
1711
+ class WMLGenerationParamsMixin(Artifact):
1712
+ decoding_method: Optional[Literal["greedy", "sample"]] = None
1713
+ length_penalty: Optional[Dict[str, Union[int, float]]] = None
1714
+ temperature: Optional[float] = None
1715
+ top_p: Optional[float] = None
1716
+ top_k: Optional[int] = None
1717
+ random_seed: Optional[int] = None
1718
+ repetition_penalty: Optional[float] = None
1719
+ min_new_tokens: Optional[int] = None
1720
+ max_new_tokens: Optional[int] = None
1721
+ stop_sequences: Optional[List[str]] = None
1722
+ time_limit: Optional[int] = None
1723
+ truncate_input_tokens: Optional[int] = None
1724
+ prompt_variables: Optional[Dict[str, Any]] = None
1725
+ return_options: Optional[Dict[str, bool]] = None
1726
+
1727
+
1728
+ class WMLChatParamsMixin(Artifact):
1729
+ frequency_penalty: Optional[float] = None
1730
+ top_logprobs: Optional[int] = 5
1731
+ presence_penalty: Optional[float] = None
1732
+ response_format: Optional[Dict[str, Any]] = None
1733
+ temperature: Optional[float] = None
1734
+ max_tokens: Optional[int] = None
1735
+ time_limit: Optional[int] = None
1736
+ top_p: Optional[float] = None
1737
+ n: Optional[int] = None
1738
+
1739
+
1740
+ CredentialsWML = Dict[
1741
+ Literal["url", "username", "password", "apikey", "project_id", "space_id"], str
1742
+ ]
1743
+
1744
+
1745
+ class WMLInferenceEngineBase(
1746
  InferenceEngine,
 
1747
  PackageRequirementsMixin,
1748
  LogProbInferenceEngine,
1749
  OptionSelectingByLogProbsInferenceEngine,
1750
  ):
1751
+ """Base for classes running inference using ibm-watsonx-ai.
1752
 
1753
  Attributes:
1754
  credentials (Dict[str, str], optional): By default, it is created by a class
1755
  instance which tries to retrieve proper environment variables
1756
+ ("WML_URL", "WML_PROJECT_ID", "WML_SPACE_ID", "WML_APIKEY", "WML_USERNAME", "WML_PASSWORD").
1757
+ However, a dictionary with the following keys: "url", "apikey", "project_id", "space_id",
1758
+ "username", "password".
1759
+ can be directly provided instead.
1760
  model_name (str, optional): ID of a model to be used for inference. Mutually
1761
  exclusive with 'deployment_id'.
1762
  deployment_id (str, optional): Deployment ID of a tuned model to be used for
1763
  inference. Mutually exclusive with 'model_name'.
1764
+ parameters (Union[WMLInferenceEngineParams, WMLGenerationParamsMixin, WMLChatParamsMixin], optional):
1765
+ Defines inference parameters and their values. Deprecated attribute; please pass the
1766
+ parameters directly to the respective class instead.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1767
  """
1768
 
1769
+ credentials: Optional[CredentialsWML] = None
1770
  model_name: Optional[str] = None
1771
  deployment_id: Optional[str] = None
1772
  label: str = "wml"
1773
  _requirements_list = {
1774
+ "ibm_watsonx_ai": "Install ibm-watsonx-ai package using 'pip install --upgrade ibm-watsonx-ai'. "
1775
  "It is advised to have Python version >=3.10 installed, as at lower version this package "
1776
  "may cause conflicts with other installed packages."
1777
  }
1778
  data_classification_policy = ["public", "proprietary"]
1779
+ parameters: Optional[
1780
+ Union[WMLInferenceEngineParams, WMLGenerationParamsMixin, WMLChatParamsMixin]
1781
+ ] = None
1782
+
1783
  _client: Any = InternalField(default=None, name="WML client")
1784
+ _model: Any = InternalField(default=None, name="WML model")
1785
 
1786
  def get_engine_id(self):
1787
+ return get_model_and_label_id(self.model_name or self.deployment_id, self.label)
1788
 
1789
  def verify(self):
1790
  super().verify()
1791
 
 
 
 
 
 
 
 
1792
  assert (
1793
  self.model_name
1794
  or self.deployment_id
 
1804
  data["credentials"][key] = value
1805
  return data
1806
 
1807
+ def _initialize_wml_client(self):
1808
+ from ibm_watsonx_ai.client import APIClient
1809
+
1810
+ if self.credentials is None:
1811
+ self.credentials = self._read_wml_credentials_from_env()
1812
+ self._verify_wml_credentials(self.credentials)
1813
+
1814
+ client = APIClient(credentials=self.credentials)
1815
+ if "space_id" in self.credentials:
1816
+ client.set.default_space(self.credentials["space_id"])
1817
+ else:
1818
+ client.set.default_project(self.credentials["project_id"])
1819
+ return client
1820
+
1821
  @staticmethod
1822
+ def _read_wml_credentials_from_env() -> CredentialsWML:
1823
+ credentials: CredentialsWML = {}
1824
+
1825
+ url = os.environ.get("WML_URL")
1826
+ assert url, (
1827
+ "Error while trying to run 'WMLInferenceEngine'. "
1828
+ "Please set the env variable: 'WML_URL'"
1829
  )
1830
+ credentials["url"] = url
1831
 
1832
+ space_id = os.environ.get("WML_SPACE_ID")
1833
+ project_id = os.environ.get("WML_PROJECT_ID")
1834
+ if space_id and project_id:
1835
+ get_logger().warning(
1836
+ "Either 'WML_SPACE_ID' or 'WML_PROJECT_ID' need to be "
1837
+ "specified, however, both were found. 'WMLInferenceEngine' "
1838
+ "will use space by default. If it is not desired, then have "
1839
+ "only one of those defined in the env."
1840
+ )
1841
+ credentials["space_id"] = space_id
1842
+ elif project_id:
1843
+ credentials["project_id"] = project_id
1844
+ else:
1845
+ raise AssertionError(
1846
+ "Error while trying to run 'WMLInferenceEngine'. "
1847
+ "Please set either 'WML_SPACE_ID' or 'WML_PROJECT_ID' env "
1848
+ "variable."
1849
+ )
1850
+
1851
+ apikey = os.environ.get("WML_APIKEY")
1852
+ username = os.environ.get("WML_USERNAME")
1853
+ password = os.environ.get("WML_PASSWORD")
1854
+
1855
+ if apikey and username and password:
1856
+ get_logger().warning(
1857
+ "Either 'WML_APIKEY' or both 'WML_USERNAME' and 'WML_PASSWORD' "
1858
+ "need to be specified, however, all of them were found. "
1859
+ "'WMLInferenceEngine' will use api key only by default. If it is not "
1860
+ "desired, then have only one of those options defined in the env."
1861
  )
1862
 
1863
+ if apikey:
1864
+ credentials["apikey"] = apikey
1865
+ elif username and password:
1866
+ credentials["username"] = username
1867
+ credentials["password"] = password
1868
+ else:
1869
+ raise AssertionError(
1870
+ "Error while trying to run 'WMLInferenceEngine'. "
1871
+ "Please set either 'WML_APIKEY' or both 'WML_USERNAME' and "
1872
+ "'WML_PASSWORD' env variables."
1873
+ )
1874
 
1875
  return credentials
1876
 
1877
+ @staticmethod
1878
+ def _verify_wml_credentials(credentials: CredentialsWML) -> None:
1879
+ assert isoftype(credentials, CredentialsWML), (
1880
+ "WML credentials object must be a dictionary which may "
1881
+ "contain only the following keys: "
1882
+ "['url', 'apikey', 'username', 'password']."
1883
+ )
1884
 
1885
+ assert credentials.get(
1886
+ "url"
1887
+ ), "'url' is a mandatory key for WML credentials dict."
1888
+ assert "space_id" in credentials or "project_id" in credentials, (
1889
+ "Either 'space_id' or 'project_id' must be provided "
1890
+ "as keys for WML credentials dict."
1891
+ )
1892
+ assert "apikey" in credentials or (
1893
+ "username" in credentials and "password" in credentials
1894
+ ), (
1895
+ "Either 'apikey' or both 'username' and 'password' must be provided "
1896
+ "as keys for WML credentials dict."
1897
+ )
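# A hedged sketch of credentials supplied directly instead of being read from the
# environment (values are illustrative): "url" is mandatory, one of "project_id" /
# "space_id" is required, plus either "apikey" or the "username"/"password" pair:
#
#     wml_credentials = {
#         "url": "https://us-south.ml.cloud.ibm.com",
#         "apikey": "<your-api-key>",
#         "project_id": "<your-project-id>",
#     }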
1898
 
1899
  def prepare_engine(self):
1900
+ self.check_missing_requirements()
1901
+
1902
  self._client = self._initialize_wml_client()
1903
 
1904
  self._set_inference_parameters()
1905
 
1906
+ def _load_model(self):
1907
+ from ibm_watsonx_ai.foundation_models.inference import ModelInference
1908
 
1909
+ self._model = ModelInference(
1910
  model_id=self.model_name,
1911
  deployment_id=self.deployment_id,
1912
  api_client=self._client,
1913
  )
 
1914
 
1915
+ @abc.abstractmethod
1916
+ def _send_requests(
1917
+ self,
1918
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
1919
+ return_logprobs: bool,
1920
+ return_meta_data: bool,
1921
+ ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
1922
+ raise NotImplementedError(
1923
+ f"The class '{self.get_pretty_print_name()}' is an abstract class. "
1924
+ f"Please used either 'WMLInferenceEngineGeneration' or "
1925
+ f"'WMLInferenceEngineChat' instead, depending on your task."
1926
+ )
1927
 
1928
  def _infer(
1929
  self,
1930
  dataset: Union[List[Dict[str, Any]], DatasetDict],
1931
  return_meta_data: bool = False,
1932
  ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
1933
+ if self._model is None:
1934
+ self._load_model()
 
 
 
 
 
 
 
 
 
 
 
 
1935
 
1936
+ return self._send_requests(
1937
+ dataset=dataset,
1938
+ return_logprobs=False,
1939
+ return_meta_data=return_meta_data,
1940
+ )
1941
 
1942
  def _infer_log_probs(
1943
  self,
1944
  dataset: Union[List[Dict[str, Any]], DatasetDict],
1945
  return_meta_data: bool = False,
1946
  ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
1947
+ if self._model is None:
1948
+ self._load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1949
 
1950
+ return self._send_requests(
1951
+ dataset=dataset,
1952
+ return_logprobs=True,
1953
+ return_meta_data=return_meta_data,
1954
  )
 
 
 
 
 
 
 
1955
 
1956
+ @abc.abstractmethod
1957
+ def get_return_object(self, predict_result, result, input_text, return_meta_data):
1958
+ raise NotImplementedError
1959
+
1960
+ def get_model_details(self) -> Dict:
1961
+ return self._model.get_details()
 
 
 
 
1962
 
1963
  def get_token_count(self, dataset):
1964
+ if self._model is None:
1965
+ self._load_model()
1966
 
1967
  texts = [instance["source"] for instance in dataset]
1968
 
 
 
 
 
 
 
1969
  for i in trange(len(texts), desc="Tokenizing"):
1970
+ response = self._model.tokenize(prompt=texts[i], return_tokens=True)[
1971
+ "result"
1972
+ ]
1973
  dataset[i]["token_count"] = response["token_count"]
1974
 
1975
  return dataset
1976
 
1977
  def get_options_log_probs(self, dataset):
1978
  """Add to each instance in the data a "options_log_prob" field, which is a dict with str as key and a list of {text: str, logprob:float}."""
1979
+ if self._model is None:
1980
+ self._load_model()
 
 
 
 
 
1981
 
1982
  texts = [x["source"] for x in dataset]
1983
 
1984
  responses = list(
1985
  tqdm(
1986
+ self._model.generate(
1987
  prompt=texts,
1988
  params={
1989
  "decoding_method": "greedy",
 
2015
  return dataset
2016
 
2017
 
2018
+ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMixin):
2019
+ """Generates text for textual inputs.
2020
 
2021
+ If you want to include images in your input, please use 'WMLInferenceEngineChat' instead.
2022
 
2023
+ Attributes:
2024
+ concurrency_limit (int): Number of concurrent requests sent to a model. Default is 10,
2025
+ which is also the maximum value.
2026
 
2027
+ Examples:
2028
+ from .api import load_dataset
2029
 
2030
+ wml_credentials = {
2031
+ "url": "some_url", "project_id": "some_id", "api_key": "some_key"
2032
+ }
2033
+ model_name = "google/flan-t5-xxl"
2034
+ wml_inference = WMLInferenceEngineGeneration(
2035
+ credentials=wml_credentials,
2036
+ model_name=model_name,
2037
+ data_classification_policy=["public"],
2038
+ top_p=0.5,
2039
+ random_seed=123,
2040
+ )
2041
 
2042
+ dataset = load_dataset(
2043
+ dataset_query="card=cards.argument_topic,template_card_index=0,loader_limit=5"
2044
+ )
2045
+ results = wml_inference.infer(dataset["test"])
2046
+ """
2047
 
2048
+ concurrency_limit: int = 10
 
2049
 
2050
+ def verify(self):
2051
+ super().verify()
 
2052
 
2053
+ assert (
2054
+ isinstance(self.concurrency_limit, int)
2055
+ and 1 <= self.concurrency_limit <= 10
2056
+ ), (
2057
+ f"'concurrency_limit' must be a positive integer not greater than 10. "
2058
+ f"However, '{self.concurrency_limit}' was given."
2059
  )
2060
 
2061
+ def _set_logprobs_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
2062
+ user_return_options = params.pop("return_options", {})
2063
+ # currently this is the only configuration that returns generated
2064
+ # logprobs and behaves as expected
2065
+ logprobs_return_options = {
2066
+ "input_tokens": True,
2067
+ "generated_tokens": True,
2068
+ "token_logprobs": True,
2069
+ "top_n_tokens": user_return_options.get("top_n_tokens", 5),
2070
+ }
 
2071
 
2072
+ for key, value in logprobs_return_options.items():
2073
+ if key in user_return_options and user_return_options[key] != value:
2074
+ raise ValueError(
2075
+ f"'{key}={user_return_options[key]}' is not supported for the 'infer_log_probs' "
2076
+ f"method of {self.__class__.__name__}. For obtaining the logprobs of generated tokens "
2077
+ f"please use '{key}={value}'."
2078
+ )
2079
 
2080
+ return {
2081
+ **params,
2082
+ "return_options": logprobs_return_options,
2083
+ }
 
 
 
 
 
 
 
 
 
 
2084
 
2085
+ def _send_requests(
2086
  self,
2087
  dataset: Union[List[Dict[str, Any]], DatasetDict],
2088
+ return_logprobs: bool,
2089
+ return_meta_data: bool,
2090
+ ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
2091
+ self.verify_not_chat_api(dataset)
2092
 
2093
+ params = self.to_dict([WMLGenerationParamsMixin], keep_empty=False)
2094
 
2095
+ if return_logprobs:
2096
+ generation_type = "generated_tokens"
2097
+ params = self._set_logprobs_params(params)
2098
+ else:
2099
+ generation_type = "generated_text"
2100
 
2101
+ inputs: List[str] = [instance["source"] for instance in dataset]
 
2102
 
2103
+ results = self._model.generate(
2104
+ prompt=inputs,
2105
+ params=params,
2106
+ concurrency_limit=self.concurrency_limit,
2107
+ )
2108
 
2109
+ final_results = []
2110
+ for result, inp in zip(results, inputs):
2111
+ result_metadata = result["results"][0]
2112
+ generated_content = result_metadata[generation_type]
2113
+ final_results.append(
2114
+ self.get_return_object(
2115
+ generated_content, result_metadata, inp, return_meta_data
2116
+ )
2117
  )
2118
+ return final_results
2119
 
2120
+ def get_return_object(self, predict_result, result, input_text, return_meta_data):
2121
+ if return_meta_data:
2122
+ return TextGenerationInferenceOutput(
2123
+ prediction=predict_result,
2124
+ input_tokens=result["input_token_count"],
2125
+ output_tokens=result["generated_token_count"],
2126
+ model_name=self.model_name or self.deployment_id,
2127
+ inference_type=self.label,
2128
+ stop_reason=result["stop_reason"],
2129
+ seed=self.random_seed,
2130
+ input_text=input_text,
2131
  )
2132
+ return predict_result
2133
+
2134
+
2135
+ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
2136
+ """Creates chat session and returns a model's response.
2137
+
2138
+ You can also include images in your inputs. If you use only textual input, it is
2139
+ recommended to use 'WMLInferenceEngineGeneration' instead as it is faster, and allows
2140
+ more parameters for text generation.
2141
+
2142
+ You can provide either already formatted messages, or a raw dataset as an input.
2143
+ In case of the former, all passed images should be base64-encoded strings given as
2144
+ an 'image_url' within a message. Moreover, only one image per list of messages
2145
+ may be sent.
2146
+ As for the latter, if there are multiple images per one instance, they will be sent
2147
+ separately with the same query. If that could possibly affect expected responses,
2148
+ concatenate images within an instance into a single image and adjust your query
2149
+ accordingly (if necessary).
2150
+
2151
+ Attributes:
2152
+ image_encoder (EncodeImageToString, optional): operator which encodes images in
2153
+ a given format to base64 strings required by the service. You should specify it when
2154
+ you are using images in your inputs.
2155
+
2156
+ Example:
2157
+ from .api import load_dataset
2158
+ from .image_operators import EncodeImageToString
2159
+
2160
+ image_encoder = EncodeImageToString(image_format="JPEG")
2161
+
2162
+ wml_credentials = {
2163
+ "url": "some_url", "project_id": "some_id", "api_key": "some_key"
2164
+ }
2165
+ model_name = "meta-llama/llama-3-2-11b-vision-instruct"
2166
+ wml_inference = WMLInferenceEngineChat(
2167
+ credentials=wml_credentials,
2168
+ model_name=model_name,
2169
+ image_encoder=image_encoder,
2170
+ data_classification_policy=["public"],
2171
+ max_tokens=1024,
2172
+ )
2173
+
2174
+ dataset = load_dataset(
2175
+ dataset_query="card=cards.doc_vqa.en,template=templates.qa.with_context.with_type,loader_limit=30"
2176
+ )
2177
+ results = wml_inference.infer(dataset["test"])
2178
+ """
2179
+
2180
+ image_encoder: Optional[EncodeImageToString] = None
2181
+
2182
+ @staticmethod
2183
+ def _extract_queries(instance: Dict[str, Any]) -> Tuple[Optional[str], List]:
2184
+ task_data = instance["task_data"]
2185
+ if isinstance(task_data, str):
2186
+ task_data = json.loads(task_data)
2187
+ question = task_data.get("question")
2188
+
2189
+ images = [None]
2190
+ if "images" in instance["media"]:
2191
+ images = extract_images(instance["source"], instance)
2192
+
2193
+ return question or instance["source"], images
2194
+
2195
+ def _create_messages_from_instance(
2196
+ self, instance: Dict[str, Any]
2197
+ ) -> List[List[Dict[str, Any]]]:
2198
+ """Method creates chat messages to be sent to a watsonx.ai model based on a given instance from a dataset."""
2199
+ text, images = self._extract_queries(instance)
2200
+
2201
+ messages: List[List[Dict[str, Any]]] = []
2202
+ base_message = {
2203
+ "role": "user",
2204
+ "content": [
2205
+ {
2206
+ "type": "text",
2207
+ "text": text,
2208
+ }
2209
+ ],
2210
+ }
2211
+
2212
+ # Iteration over all possible images to create a separate message for
2213
+ # every single image, since SDK allows only one image per request.
2214
+ for image in images:
2215
+ message = base_message.copy()
2216
+
2217
+ if image is not None:
2218
+ encoded_image = image
2219
+ if not isinstance(encoded_image, str):
2220
+ if self.image_encoder is None:
2221
+ raise ValueError(
2222
+ "If sending image queries as well, and they are not "
2223
+ "already encoded to base64 strings, you must specify "
2224
+ "the 'image_encoder' to be used."
2225
+ )
2226
+ encoded_image = self.image_encoder.encode_image_to_base64(image)
2227
+
2228
+ message["content"].append(
2229
+ {
2230
+ "type": "image_url",
2231
+ "image_url": {
2232
+ "url": "data:image/jpeg;base64," + encoded_image,
2233
+ },
2234
+ }
2235
+ )
2236
+
2237
+ messages.append([message])
2238
+
2239
+ return messages
2240
+
2241
+ @staticmethod
2242
+ def verify_messages(messages: List[Dict[str, Any]]):
2243
+ """Method verifies if externally provided messages containing images are compatible with the format required by ibm-watsonx-ai."""
2244
+ n_images = 0
2245
+ for message in messages:
2246
+ if isinstance(message["content"], str):
2247
+ continue
2248
+
2249
+ for content in message["content"]:
2250
+ if isinstance(content, dict):
2251
+ if "image" in content["type"] and content["type"] != "image_url":
2252
+ raise ValueError(
2253
+ f"ibm-watsonx-ai only supports sending images as base64-encoded "
2254
+ f"strings, which should be given as 'image_url' in a message. "
2255
+ f"However, '{content['type']}' was given."
2256
+ )
2257
+
2258
+ if content["type"] == "image_url":
2259
+ n_images += 1
2260
+ if n_images > 1:
2261
+ raise ValueError(
2262
+ "ibm-watsonx-ai only supports sending one image per a list "
2263
+ "of messages."
2264
+ )
2265
+
2266
+ def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]:
2267
+ if isinstance(instance["source"], str) and "media" in instance:
2268
+ return self._create_messages_from_instance(instance)
2269
+
2270
+ messages = super().to_messages(instance)
2271
+ self.verify_messages(messages)
2272
+ # This is done to be compatible with inputs containing
2273
+ # images as SDK allows sending only one image per message.
2274
+ return [messages]
2275
+
2276
+ def _send_requests(
2277
+ self,
2278
+ dataset: Union[List[Dict[str, Any]], DatasetDict],
2279
+ return_logprobs: bool,
2280
+ return_meta_data: bool,
2281
+ ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
2282
+ params = self.to_dict([WMLChatParamsMixin], keep_empty=False)
2283
+
2284
+ if return_logprobs:
2285
+ output_type = "logprobs"
2286
+ params["logprobs"] = True
2287
+ else:
2288
+ output_type = "message"
2289
+ params["logprobs"] = False
2290
+
2291
+ final_results = []
2292
+
2293
+ for instance in dataset:
2294
+ messages = self.to_messages(instance)
2295
+
2296
+ for message in messages:
2297
+ result = self._model.chat(
2298
+ messages=message,
2299
+ params=params,
2300
+ )
2301
+
2302
+ final_results.append(
2303
+ self.get_return_object(
2304
+ result["choices"][0][output_type]["content"],
2305
+ result,
2306
+ instance["source"],
2307
+ return_meta_data,
2308
+ )
2309
+ )
2310
+
2311
+ return final_results
2312
+
2313
+ def get_return_object(self, predict_result, result, input_text, return_meta_data):
2314
+ if return_meta_data:
2315
+ return TextGenerationInferenceOutput(
2316
+ prediction=predict_result,
2317
+ input_tokens=result["usage"]["prompt_tokens"],
2318
+ output_tokens=len(predict_result)
2319
+ if isinstance(predict_result, list)
2320
+ else None,
2321
+ model_name=self.model_name or self.deployment_id,
2322
+ inference_type=self.label,
2323
+ stop_reason=result["choices"][0]["finish_reason"],
2324
+ input_text=input_text,
2325
  )
2326
+ return predict_result
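When pre-formatted messages are passed to WMLInferenceEngineChat, images must already be base64-encoded and attached as a single 'image_url' entry, mirroring the structure built in _create_messages_from_instance above. A hedged sketch of such a message list (the base64 payload is a placeholder):

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in the picture?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,<BASE64_STRING>"},
                },
            ],
        }
    ]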
2327
 
2328
+
2329
+ @deprecation(
2330
+ version="2.0.0",
2331
+ msg=" Please use either 'WMLInferenceEngineGeneration' or 'WMLInferenceEngineChat'"
2332
+ " depending on your task.",
2333
+ )
2334
+ class WMLInferenceEngine(WMLInferenceEngineGeneration):
2335
+ def prepare_engine(self):
2336
+ super().prepare_engine()
2337
+ get_logger().warning("'WMLInferenceEngine' is deprecated")
2338
+
2339
+
2340
+ def get_images_without_text(instance):
2341
+ return extract_images(instance["source"], instance)
2342
+
2343
+
2344
+ def get_text_without_images(instance, image_token="<image>"):
2345
+ regex = r"<" + f"{constants.image_tag}" + r'\s+src=["\'](.*?)["\']\s*/?>'
2346
+ return re.sub(regex, image_token, instance["source"])
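The two helpers above split a multimodal "source" into its image and text parts; the text helper replaces each inline image tag with a placeholder token. A hedged sketch, assuming constants.image_tag is "img" so that the tags look like HTML img elements:

    instance = {"source": 'Describe the chart. <img src="chart.png"> Thanks.'}
    get_text_without_images(instance)
    # -> 'Describe the chart. <image> Thanks.'
    get_text_without_images(instance, image_token="<|image|>")
    # -> 'Describe the chart. <|image|> Thanks.'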
2347
 
2348
 
2349
  class LMMSEvalBaseInferenceEngine(
 
2354
  batch_size: int = 1
2355
  image_token = "<image>"
2356
 
2357
+ _requirements_list = {
2358
+ "lmms_eval": "Install llms-eval package using 'pip install lmms-eval==0.2.4'",
2359
+ }
2360
 
2361
  def prepare_engine(self):
2362
  if not self.lazy_load:
 
2403
  dataset: Union[List[Dict[str, Any]], DatasetDict],
2404
  return_meta_data: bool = False,
2405
  ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
 
2406
  if not self._is_loaded():
2407
  self._prepare_engine()
2408
 
llm_as_judge.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from abc import abstractmethod
2
  from typing import Any, Dict, List, Literal, Optional
3
 
@@ -23,7 +24,7 @@ def get_task_data_dict(task_data):
23
  return json.loads(task_data) if isinstance(task_data, str) else task_data
24
 
25
 
26
- class LLMAsJudgeBase(BulkInstanceMetric):
27
  """LLM-as-judge-base metric class for evaluating correctness of generated predictions.
28
 
29
  Attributes:
@@ -122,7 +123,7 @@ class LLMAsJudgeBase(BulkInstanceMetric):
122
  pass
123
 
124
 
125
- class LLMAsJudge(LLMAsJudgeBase, ArtifactFetcherMixin):
126
  """LLM-as-judge-based metric class for evaluating correctness of generated predictions.
127
 
128
  This class uses the source prompt given to the generator and the generator's predictions to evaluate
@@ -371,6 +372,17 @@ class TaskBasedLLMasJudge(LLMAsJudgeBase):
371
  super().prepare()
372
  self.reduction_map = {"mean": [self.main_score]}
373
  self.score_prefix = f"{self.inference_model.get_engine_id()}_"
 
 
 
 
 
 
 
 
 
 
 
374
 
375
  def get_full_task_name(self):
376
  return self.task
 
1
+ import re
2
  from abc import abstractmethod
3
  from typing import Any, Dict, List, Literal, Optional
4
 
 
24
  return json.loads(task_data) if isinstance(task_data, str) else task_data
25
 
26
 
27
+ class LLMAsJudgeBase(BulkInstanceMetric, ArtifactFetcherMixin):
28
  """LLM-as-judge-base metric class for evaluating correctness of generated predictions.
29
 
30
  Attributes:
 
123
  pass
124
 
125
 
126
+ class LLMAsJudge(LLMAsJudgeBase):
127
  """LLM-as-judge-based metric class for evaluating correctness of generated predictions.
128
 
129
  This class uses the source prompt given to the generator and the generator's predictions to evaluate
 
372
  super().prepare()
373
  self.reduction_map = {"mean": [self.main_score]}
374
  self.score_prefix = f"{self.inference_model.get_engine_id()}_"
375
+ if not self.format:
376
+ self.set_format_for_inference_engine()
377
+
378
+ # if format is not directly set in constructor, choose according to the inference model
379
+ def set_format_for_inference_engine(self):
380
+ model_name = self.inference_model.get_engine_id()
381
+ if re.search("llama.?3.*instruct", model_name):
382
+ format_name = "formats.llama3_instruct"
383
+ else:
384
+ format_name = "formats.empty"
385
+ self.format = self.get_artifact(format_name)
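# For illustration (engine ids below are hypothetical): an id such as
# "meta-llama/llama-3-8b-instruct_ibm_genai" matches the pattern above and selects
# "formats.llama3_instruct", while an id like "google/flan-t5-xxl_hf" does not match
# and falls back to "formats.empty".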
386
 
387
  def get_full_task_name(self):
388
  return self.task
loaders.py CHANGED
@@ -1,7 +1,7 @@
1
  """This section describes unitxt loaders.
2
 
3
 Loaders: Generators of Unitxt Multistreams from existing data sources
4
- ==============================================================
5
 
6
 Unitxt is all about readily preparing any given data source for feeding into any given language model, and then,
7
  post-processing the model's output, preparing it for any given evaluator.
@@ -16,14 +16,14 @@ All these loaders inherit from Loader, and hence, implementing a loader to expan
16
  straightforward.
17
 
18
  Available Loaders Overview:
19
- - :ref:`LoadHF <unitxt.loaders.LoadHF>` - Loads data from HuggingFace Datasets.
20
- - :ref:`LoadCSV <unitxt.loaders.LoadCSV>` - Imports data from CSV (Comma-Separated Values) files.
21
- - :ref:`LoadFromKaggle <unitxt.loaders.LoadFromKaggle>` - Retrieves datasets from the Kaggle community site.
22
- - :ref:`LoadFromIBMCloud <unitxt.loaders.LoadFromIBMCloud>` - Fetches datasets hosted on IBM Cloud.
23
- - :ref:`LoadFromSklearn <unitxt.loaders.LoadFromSklearn>` - Loads datasets available through the sklearn library.
24
- - :ref:`MultipleSourceLoader <unitxt.loaders.MultipleSourceLoader>` - Combines data from multiple different sources.
25
- - :ref:`LoadFromDictionary <unitxt.loaders.LoadFromDictionary>` - Loads data from a user-defined Python dictionary.
26
- - :ref:`LoadFromHFSpace <unitxt.loaders.LoadFromHFSpace>` - Downloads and loads data from HuggingFace Spaces.
27
 
28
 
29
 
 
1
  """This section describes unitxt loaders.
2
 
3
 Loaders: Generators of Unitxt Multistreams from existing data sources
4
+ =====================================================================
5
 
6
 Unitxt is all about readily preparing any given data source for feeding into any given language model, and then,
7
  post-processing the model's output, preparing it for any given evaluator.
 
16
  straightforward.
17
 
18
  Available Loaders Overview:
19
+ - :class:`LoadHF <unitxt.loaders.LoadHF>` - Loads data from HuggingFace Datasets.
20
+ - :class:`LoadCSV <unitxt.loaders.LoadCSV>` - Imports data from CSV (Comma-Separated Values) files.
21
+ - :class:`LoadFromKaggle <unitxt.loaders.LoadFromKaggle>` - Retrieves datasets from the Kaggle community site.
22
+ - :class:`LoadFromIBMCloud <unitxt.loaders.LoadFromIBMCloud>` - Fetches datasets hosted on IBM Cloud.
23
+ - :class:`LoadFromSklearn <unitxt.loaders.LoadFromSklearn>` - Loads datasets available through the sklearn library.
24
+ - :class:`MultipleSourceLoader <unitxt.loaders.MultipleSourceLoader>` - Combines data from multiple different sources.
25
+ - :class:`LoadFromDictionary <unitxt.loaders.LoadFromDictionary>` - Loads data from a user-defined Python dictionary.
26
+ - :class:`LoadFromHFSpace <unitxt.loaders.LoadFromHFSpace>` - Downloads and loads data from HuggingFace Spaces.
27
 
28
 
29
 
metrics.py CHANGED
@@ -18,6 +18,7 @@ from scipy.stats import bootstrap
18
  from scipy.stats._warnings_errors import DegenerateDataWarning
19
 
20
  from .artifact import Artifact
 
21
  from .dataclass import (
22
  AbstractField,
23
  InternalField,
@@ -50,6 +51,12 @@ settings = get_settings()
50
  warnings.filterwarnings("ignore", category=DegenerateDataWarning)
51
 
52
 
 
 
 
 
 
 
53
  def abstract_factory():
54
  return {}
55
 
 
18
  from scipy.stats._warnings_errors import DegenerateDataWarning
19
 
20
  from .artifact import Artifact
21
+ from .collections import ListCollection
22
  from .dataclass import (
23
  AbstractField,
24
  InternalField,
 
51
  warnings.filterwarnings("ignore", category=DegenerateDataWarning)
52
 
53
 
54
+ class MetricsList(ListCollection):
55
+ def verify(self):
56
+ for metric in self.items:
57
+ assert isinstance(metric, Metric)
58
+
59
+
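MetricsList above lets a single catalog entry bundle several metrics; Task.check_metrics_type and ApplyMetric (see task.py and operators.py below) unpack it back into individual Metric objects. A hedged sketch, assuming ListCollection supports no-argument construction and an append() method, with the member metrics fetched from the catalog elsewhere:

    bundle = MetricsList()
    bundle.append(rouge_metric)      # rouge_metric, accuracy_metric: Metric instances
    bundle.append(accuracy_metric)
    bundle.verify()                  # asserts that every item is a Metric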
60
  def abstract_factory():
61
  return {}
62
 
operators.py CHANGED
@@ -1617,7 +1617,7 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
1617
  calc_confidence_intervals: bool
1618
 
1619
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1620
- from .metrics import Metric
1621
 
1622
  # Number of instances in input stream is assumed to be small. This is why
1623
  # each metric consumes all of them and lays them in its main memory, and even generates
@@ -1646,18 +1646,25 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
1646
  if isinstance(metric_names, str):
1647
  metric_names = [metric_names]
1648
 
 
 
 
 
 
 
 
 
 
 
 
 
1649
  # Each metric operator computes its score and then sets the main score, overwriting
1650
  # the previous main score value (if any). So, we need to reverse the order of the listed metrics.
1651
  # This will cause the first listed metric to run last, and the main score will be set
1652
  # by the first listed metric (as desired).
1653
- metric_names = list(reversed(metric_names))
1654
-
1655
- for metric_name in metric_names:
1656
- metric = self.get_artifact(metric_name)
1657
- assert isinstance(
1658
- metric, Metric
1659
- ), f"Operator {metric_name} must be a Metric"
1660
 
 
1661
  if not self.calc_confidence_intervals:
1662
  metric.disable_confidence_interval_calculation()
1663
  multi_stream = MultiStream(
 
1617
  calc_confidence_intervals: bool
1618
 
1619
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1620
+ from .metrics import Metric, MetricsList
1621
 
1622
  # Number of instances in input stream is assumed to be small. This is why
1623
  # each metric consumes all of them and lays them in its main memory, and even generates
 
1646
  if isinstance(metric_names, str):
1647
  metric_names = [metric_names]
1648
 
1649
+ metrics_list = []
1650
+ for metric_name in metric_names:
1651
+ metric = self.get_artifact(metric_name)
1652
+ if isinstance(metric, MetricsList):
1653
+ metrics_list.extend(list(metric.items))
1654
+ elif isinstance(metric, Metric):
1655
+ metrics_list.append(metric)
1656
+ else:
1657
+ raise ValueError(
1658
+ f"Operator {metric_name} must be a Metric or MetricsList"
1659
+ )
1660
+
1661
  # Each metric operator computes its score and then sets the main score, overwriting
1662
  # the previous main score value (if any). So, we need to reverse the order of the listed metrics.
1663
  # This will cause the first listed metric to run last, and the main score will be set
1664
  # by the first listed metric (as desired).
1665
+ metrics_list = list(reversed(metrics_list))
 
 
 
 
 
 
1666
 
1667
+ for metric in metrics_list:
1668
  if not self.calc_confidence_intervals:
1669
  metric.disable_confidence_interval_calculation()
1670
  multi_stream = MultiStream(
settings_utils.py CHANGED
@@ -161,8 +161,8 @@ if Constants.is_uninitilized():
161
  constants.metric_file = os.path.join(os.path.dirname(__file__), "metric.py")
162
  constants.local_catalog_path = os.path.join(os.path.dirname(__file__), "catalog")
163
  unitxt_pkg = importlib.util.find_spec("unitxt")
164
- constants.package_dir = os.path.dirname(unitxt_pkg.origin)
165
  if unitxt_pkg and unitxt_pkg.origin:
 
166
  constants.default_catalog_path = os.path.join(constants.package_dir, "catalog")
167
  else:
168
  constants.default_catalog_path = constants.local_catalog_path
 
161
  constants.metric_file = os.path.join(os.path.dirname(__file__), "metric.py")
162
  constants.local_catalog_path = os.path.join(os.path.dirname(__file__), "catalog")
163
  unitxt_pkg = importlib.util.find_spec("unitxt")
 
164
  if unitxt_pkg and unitxt_pkg.origin:
165
+ constants.package_dir = os.path.dirname(unitxt_pkg.origin)
166
  constants.default_catalog_path = os.path.join(constants.package_dir, "catalog")
167
  else:
168
  constants.default_catalog_path = constants.local_catalog_path
standard.py CHANGED
@@ -1,9 +1,7 @@
1
  from typing import List, Optional, Union
2
 
3
  from .artifact import fetch_artifact
4
- from .augmentors import (
5
- Augmentor,
6
- )
7
  from .card import TaskCard
8
  from .collections_operators import GetLength
9
  from .dataclass import Field, InternalField, NonPositionalField, OptionalField
@@ -21,6 +19,7 @@ from .stream import MultiStream
21
  from .system_prompts import EmptySystemPrompt, SystemPrompt
22
  from .task import Task
23
  from .templates import ApplyRandomTemplate, ApplySingleTemplate, Template, TemplatesList
 
24
  from .utils import LRUCache
25
 
26
  constants = get_constants()
@@ -305,7 +304,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
305
 
306
  self.processing.steps.append(self.task)
307
 
308
- if self.augmentor is not None:
309
  if (
310
  self.card.task.augmentable_inputs is None
311
  or len(self.task.augmentable_inputs) == 0
@@ -484,14 +483,12 @@ class StandardRecipe(StandardRecipeWithIndexes):
484
  sampler (Sampler, optional): The Sampler used to select the demonstrations when num_demos > 0.
485
  steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
486
 augmentor (Augmentor): Augmentor to be used to pseudo-randomly augment the source text
487
- instruction_card_index (int, optional): Index of instruction card to be used
488
- for preparing the recipe.
489
- template_card_index (int, optional): Index of template card to be used for
490
- preparing the recipe.
491
 
492
  Methods:
493
  prepare(): This overridden method is used for preparing the recipe
494
- by arranging all the steps, refiners, and renderers in a sequential manner.
495
 
496
  Raises:
497
  AssertionError: If both template and template_card_index are specified at the same time.
 
1
  from typing import List, Optional, Union
2
 
3
  from .artifact import fetch_artifact
4
+ from .augmentors import Augmentor, NullAugmentor
 
 
5
  from .card import TaskCard
6
  from .collections_operators import GetLength
7
  from .dataclass import Field, InternalField, NonPositionalField, OptionalField
 
19
  from .system_prompts import EmptySystemPrompt, SystemPrompt
20
  from .task import Task
21
  from .templates import ApplyRandomTemplate, ApplySingleTemplate, Template, TemplatesList
22
+ from .type_utils import isoftype
23
  from .utils import LRUCache
24
 
25
  constants = get_constants()
 
304
 
305
  self.processing.steps.append(self.task)
306
 
307
+ if self.augmentor is not None and not isoftype(self.augmentor, NullAugmentor):
308
  if (
309
  self.card.task.augmentable_inputs is None
310
  or len(self.task.augmentable_inputs) == 0
 
483
  sampler (Sampler, optional): The Sampler used to select the demonstrations when num_demos > 0.
484
  steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
485
 augmentor (Augmentor): Augmentor to be used to pseudo-randomly augment the source text
486
+ instruction_card_index (int, optional): Index of instruction card to be used for preparing the recipe.
487
+ template_card_index (int, optional): Index of template card to be used for preparing the recipe.
 
 
488
 
489
  Methods:
490
  prepare(): This overridden method is used for preparing the recipe
491
+ by arranging all the steps, refiners, and renderers in a sequential manner.
492
 
493
  Raises:
494
  AssertionError: If both template and template_card_index are specified at the same time.
task.py CHANGED
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
5
  from .deprecation_utils import deprecation
6
  from .error_utils import Documentation, UnitxtError, UnitxtWarning
7
  from .logging_utils import get_logger
 
8
  from .operator import InstanceOperator
9
  from .operators import ArtifactFetcherMixin
10
  from .settings_utils import get_constants
@@ -186,31 +187,34 @@ class Task(InstanceOperator, ArtifactFetcherMixin):
186
 
187
  @classmethod
188
  @lru_cache(maxsize=None)
189
- def get_metric_prediction_type(cls, metric_id: str):
190
  metric = cls.get_artifact(metric_id)
191
- return metric.prediction_type
 
 
192
 
193
  def check_metrics_type(self) -> None:
194
  prediction_type = self.prediction_type
195
  for metric_id in self.metrics:
196
- metric_prediction_type = Task.get_metric_prediction_type(metric_id)
197
-
198
- if (
199
- prediction_type == metric_prediction_type
200
- or prediction_type == Any
201
- or metric_prediction_type == Any
202
- or (
203
- get_origin(metric_prediction_type) is Union
204
- and prediction_type in get_args(metric_prediction_type)
205
- )
206
- ):
207
- continue
 
208
 
209
- raise UnitxtError(
210
- f"The task's prediction type ({prediction_type}) and '{metric_id}' "
211
- f"metric's prediction type ({metric_prediction_type}) are different.",
212
- Documentation.ADDING_TASK,
213
- )
214
 
215
  def verify_defaults(self):
216
  if self.defaults:
 
5
  from .deprecation_utils import deprecation
6
  from .error_utils import Documentation, UnitxtError, UnitxtWarning
7
  from .logging_utils import get_logger
8
+ from .metrics import MetricsList
9
  from .operator import InstanceOperator
10
  from .operators import ArtifactFetcherMixin
11
  from .settings_utils import get_constants
 
187
 
188
  @classmethod
189
  @lru_cache(maxsize=None)
190
+ def get_metrics_artifacts(cls, metric_id: str):
191
  metric = cls.get_artifact(metric_id)
192
+ if isinstance(metric, MetricsList):
193
+ return metric.items
194
+ return [metric]
195
 
196
  def check_metrics_type(self) -> None:
197
  prediction_type = self.prediction_type
198
  for metric_id in self.metrics:
199
+ metric_artifacts_list = Task.get_metrics_artifacts(metric_id)
200
+ for metric_artifact in metric_artifacts_list:
201
+ metric_prediction_type = metric_artifact.prediction_type
202
+ if (
203
+ prediction_type == metric_prediction_type
204
+ or prediction_type == Any
205
+ or metric_prediction_type == Any
206
+ or (
207
+ get_origin(metric_prediction_type) is Union
208
+ and prediction_type in get_args(metric_prediction_type)
209
+ )
210
+ ):
211
+ continue
212
 
213
+ raise UnitxtError(
214
+ f"The task's prediction type ({prediction_type}) and '{metric_id}' "
215
+ f"metric's prediction type ({metric_prediction_type}) are different.",
216
+ Documentation.ADDING_TASK,
217
+ )
218
 
219
  def verify_defaults(self):
220
  if self.defaults:
text_utils.py CHANGED
@@ -137,7 +137,8 @@ def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
137
  if len(d) == 0:
138
  return ["{}"]
139
  for key, val in d.items():
140
- res.append(key + ": ")
 
141
  yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
142
  assert len(yaml_for_val) > 0
143
  if is_simple(val):
 
137
  if len(d) == 0:
138
  return ["{}"]
139
  for key, val in d.items():
140
+ printable_key = f'"{key}"' if (" " in key) or (key == "") else key
141
+ res.append(printable_key + ": ")
142
  yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
143
  assert len(yaml_for_val) > 0
144
  if is_simple(val):
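The quoting fix above matters when a dictionary key contains spaces or is empty, which would otherwise produce ambiguous YAML-like lines. A hedged sketch of the intended effect (rendered output shown approximately as comments):

    construct_dict_as_yaml_lines({"has space": 1})  # key emitted as '"has space": ...'
    construct_dict_as_yaml_lines({"": 1})           # key emitted as '"": ...'
    construct_dict_as_yaml_lines({"plain": 1})      # key stays unquoted: 'plain: ...'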
version.py CHANGED
@@ -1 +1 @@
1
- version = "1.15.6"
 
1
+ version = "1.15.7"