Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
- .venv/lib/python3.11/site-packages/ray/train/__init__.py +90 -0
- .venv/lib/python3.11/site-packages/ray/train/_checkpoint.py +424 -0
- .venv/lib/python3.11/site-packages/ray/train/backend.py +59 -0
- .venv/lib/python3.11/site-packages/ray/train/base_trainer.py +827 -0
- .venv/lib/python3.11/site-packages/ray/train/constants.py +118 -0
- .venv/lib/python3.11/site-packages/ray/train/context.py +139 -0
- .venv/lib/python3.11/site-packages/ray/train/data_parallel_trainer.py +587 -0
- .venv/lib/python3.11/site-packages/ray/train/error.py +6 -0
- .venv/lib/python3.11/site-packages/ray/train/examples/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/train/examples/mlflow_simple_example.py +55 -0
- .venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_autoencoder_example.py +77 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__init__.py +12 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/_transformers_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/_transformers_utils.py +143 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__init__.py +18 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/_lightgbm_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_trainer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/v2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/_lightgbm_utils.py +170 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/config.py +89 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_checkpoint.py +70 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_predictor.py +152 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py +221 -0
- .venv/lib/python3.11/site-packages/ray/train/lightgbm/v2.py +132 -0
- .venv/lib/python3.11/site-packages/ray/train/predictor.py +254 -0
- .venv/lib/python3.11/site-packages/ray/train/session.py +0 -0
- .venv/lib/python3.11/site-packages/ray/train/trainer.py +194 -0
- .venv/lib/python3.11/site-packages/ray/train/utils.py +19 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__init__.py +20 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/_xgboost_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/v2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_trainer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/_xgboost_utils.py +210 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/config.py +202 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/v2.py +133 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_checkpoint.py +75 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_predictor.py +160 -0
- .venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_trainer.py +222 -0
- .venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/train/__init__.py
ADDED
@@ -0,0 +1,90 @@
+# Try import ray[train] core requirements (defined in setup.py)
+# isort: off
+try:
+    import fsspec  # noqa: F401
+    import pandas  # noqa: F401
+    import pyarrow  # noqa: F401
+    import requests  # noqa: F401
+except ImportError as exc:
+    raise ImportError(
+        "Can't import ray.train as some dependencies are missing. "
+        'Run `pip install "ray[train]"` to fix.'
+    ) from exc
+# isort: on
+
+
+from ray._private.usage import usage_lib
+from ray.air.config import CheckpointConfig, FailureConfig, RunConfig, ScalingConfig
+from ray.air.result import Result
+
+# Import this first so it can be used in other modules
+from ray.train._checkpoint import Checkpoint
+from ray.train._internal.data_config import DataConfig
+from ray.train._internal.session import get_checkpoint, get_dataset_shard, report
+from ray.train._internal.syncer import SyncConfig
+from ray.train.backend import BackendConfig
+from ray.train.constants import TRAIN_DATASET_KEY
+from ray.train.context import get_context
+from ray.train.trainer import TrainingIterator
+from ray.train.v2._internal.constants import is_v2_enabled
+
+if is_v2_enabled():
+    from ray.train.v2.api.callback import UserCallback  # noqa: F811
+    from ray.train.v2.api.config import (  # noqa: F811
+        FailureConfig,
+        RunConfig,
+        ScalingConfig,
+    )
+    from ray.train.v2.api.result import Result  # noqa: F811
+    from ray.train.v2.api.train_fn_utils import (  # noqa: F811
+        get_checkpoint,
+        get_context,
+        get_dataset_shard,
+        report,
+    )
+
+
+usage_lib.record_library_usage("train")
+
+Checkpoint.__module__ = "ray.train"
+
+__all__ = [
+    "get_checkpoint",
+    "get_context",
+    "get_dataset_shard",
+    "report",
+    "BackendConfig",
+    "Checkpoint",
+    "CheckpointConfig",
+    "DataConfig",
+    "FailureConfig",
+    "Result",
+    "RunConfig",
+    "ScalingConfig",
+    "SyncConfig",
+    "TrainingIterator",
+    "TRAIN_DATASET_KEY",
+]
+
+get_checkpoint.__module__ = "ray.train"
+get_context.__module__ = "ray.train"
+get_dataset_shard.__module__ = "ray.train"
+report.__module__ = "ray.train"
+BackendConfig.__module__ = "ray.train"
+Checkpoint.__module__ = "ray.train"
+CheckpointConfig.__module__ = "ray.train"
+DataConfig.__module__ = "ray.train"
+FailureConfig.__module__ = "ray.train"
+Result.__module__ = "ray.train"
+RunConfig.__module__ = "ray.train"
+ScalingConfig.__module__ = "ray.train"
+SyncConfig.__module__ = "ray.train"
+TrainingIterator.__module__ = "ray.train"
+
+
+if is_v2_enabled():
+    __all__.append("UserCallback")
+    UserCallback.__module__ = "ray.train"
+
+
+# DO NOT ADD ANYTHING AFTER THIS LINE.
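This `__init__.py` only re-exports the public Ray Train API (`report`, `get_checkpoint`, `Checkpoint`, and the config classes) under the `ray.train` namespace. As a rough illustration of how those re-exports are typically consumed, here is a minimal sketch of a training function that reports a metric together with a directory-based checkpoint; the function name, metric name, and file name are illustrative and not part of this diff.

```python
import os
import tempfile

from ray.train import Checkpoint, report


def train_func(config: dict) -> None:
    """Illustrative training function using the re-exported ray.train API."""
    for epoch in range(config.get("epochs", 3)):
        loss = 1.0 / (epoch + 1)  # placeholder metric

        # Write state to a temporary directory and report it as a Checkpoint.
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, "state.txt"), "w") as f:
                f.write(f"epoch={epoch}")
            report(
                {"loss": loss, "epoch": epoch},
                checkpoint=Checkpoint.from_directory(tmpdir),
            )
```

This sketch assumes it runs inside a Ray Train worker (for example, as the `train_loop_per_worker` of a data-parallel trainer), since `report` is only meaningful within a training session.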
.venv/lib/python3.11/site-packages/ray/train/_checkpoint.py
ADDED
@@ -0,0 +1,424 @@
+import contextlib
+import glob
+import json
+import logging
+import os
+import platform
+import shutil
+import tempfile
+import traceback
+import uuid
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+import pyarrow.fs
+
+from ray.air._internal.filelock import TempFileLock
+from ray.train._internal.storage import _download_from_fs_path, _exists_at_fs_path
+from ray.util.annotations import PublicAPI
+
+logger = logging.getLogger(__name__)
+
+# The filename of the file that stores user metadata set on the checkpoint.
+_METADATA_FILE_NAME = ".metadata.json"
+
+# The prefix of the temp checkpoint directory that `to_directory` downloads to
+# on the local filesystem.
+_CHECKPOINT_TEMP_DIR_PREFIX = "checkpoint_tmp_"
+
+
+class _CheckpointMetaClass(type):
+    def __getattr__(self, item):
+        try:
+            return super().__getattribute__(item)
+        except AttributeError as exc:
+            if item in {
+                "from_dict",
+                "to_dict",
+                "from_bytes",
+                "to_bytes",
+                "get_internal_representation",
+            }:
+                raise _get_migration_error(item) from exc
+            elif item in {
+                "from_uri",
+                "to_uri",
+                "uri",
+            }:
+                raise _get_uri_error(item) from exc
+            elif item in {"get_preprocessor", "set_preprocessor"}:
+                raise _get_preprocessor_error(item) from exc
+
+            raise exc
+
+
+@PublicAPI(stability="beta")
+class Checkpoint(metaclass=_CheckpointMetaClass):
+    """A reference to data persisted as a directory in local or remote storage.
+
+    Access the checkpoint contents locally using ``checkpoint.to_directory()``
+    or ``checkpoint.as_directory``.
+
+    Attributes
+    ----------
+    path: A path on the filesystem containing the checkpoint contents.
+    filesystem: PyArrow FileSystem that can be used to access data at the `path`.
+
+    See Also
+    --------
+    ray.train.report : Report a checkpoint during training (with Ray Train/Tune).
+    ray.train.get_checkpoint : Get the latest checkpoint during training
+        (for restoration).
+
+    :ref:`train-checkpointing`
+    :ref:`persistent-storage-guide`
+
+    Examples
+    --------
+
+    Creating a checkpoint using ``Checkpoint.from_directory``:
+
+    >>> from ray.train import Checkpoint
+    >>> checkpoint = Checkpoint.from_directory("/tmp/example_checkpoint_dir")
+    >>> checkpoint.filesystem  # doctest: +ELLIPSIS
+    <pyarrow._fs.LocalFileSystem object...
+    >>> checkpoint.path
+    '/tmp/example_checkpoint_dir'
+
+    Creating a checkpoint from a remote URI:
+
+    >>> checkpoint = Checkpoint("s3://bucket/path/to/checkpoint")
+    >>> checkpoint.filesystem  # doctest: +ELLIPSIS
+    <pyarrow._s3fs.S3FileSystem object...
+    >>> checkpoint.path
+    'bucket/path/to/checkpoint'
+
+    Creating a checkpoint with a custom filesystem:
+
+    >>> checkpoint = Checkpoint(
+    ...     path="bucket/path/to/checkpoint",
+    ...     filesystem=pyarrow.fs.S3FileSystem(),
+    ... )
+    >>> checkpoint.filesystem  # doctest: +ELLIPSIS
+    <pyarrow._s3fs.S3FileSystem object...
+    >>> checkpoint.path
+    'bucket/path/to/checkpoint'
+
+    Accessing a checkpoint's contents:
+
+    >>> import os  # doctest: +SKIP
+    >>> with checkpoint.as_directory() as local_checkpoint_dir:  # doctest: +SKIP
+    ...     print(os.listdir(local_checkpoint_dir))  # doctest: +SKIP
+    ['model.pt', 'optimizer.pt', 'misc.pt']
+    """
+
+    def __init__(
+        self,
+        path: Union[str, os.PathLike],
+        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+    ):
+        """Construct a Checkpoint.
+
+        Args:
+            path: A local path or remote URI containing the checkpoint data.
+                If a filesystem is provided, then this path must NOT be a URI.
+                It should be a path on the filesystem with the prefix already stripped.
+            filesystem: PyArrow FileSystem to use to access data at the path.
+                If not specified, this is inferred from the URI scheme.
+        """
+        self.path = str(path)
+        self.filesystem = filesystem
+
+        if path and not filesystem:
+            self.filesystem, self.path = pyarrow.fs.FileSystem.from_uri(path)
+
+        # This random UUID is used to create a temporary directory name on the
+        # local filesystem, which will be used for downloading checkpoint data.
+        # This ensures that if multiple processes download the same checkpoint object
+        # only one process performs the actual download while the others wait.
+        # This prevents duplicated download efforts and data.
+        # NOTE: Calling `to_directory` from multiple `Checkpoint` objects
+        # that point to the same (fs, path) will still download the data multiple times.
+        # This only ensures a canonical temp directory name for a single `Checkpoint`.
+        self._uuid = uuid.uuid4()
+
+    def __repr__(self):
+        return f"Checkpoint(filesystem={self.filesystem.type_name}, path={self.path})"
+
+    def get_metadata(self) -> Dict[str, Any]:
+        """Return the metadata dict stored with the checkpoint.
+
+        If no metadata is stored, an empty dict is returned.
+        """
+        metadata_path = Path(self.path, _METADATA_FILE_NAME).as_posix()
+        if not _exists_at_fs_path(self.filesystem, metadata_path):
+            return {}
+
+        with self.filesystem.open_input_file(metadata_path) as f:
+            return json.loads(f.readall().decode("utf-8"))
+
+    def set_metadata(self, metadata: Dict[str, Any]) -> None:
+        """Set the metadata stored with this checkpoint.
+
+        This will overwrite any existing metadata stored with this checkpoint.
+        """
+        metadata_path = Path(self.path, _METADATA_FILE_NAME).as_posix()
+        with self.filesystem.open_output_stream(metadata_path) as f:
+            f.write(json.dumps(metadata).encode("utf-8"))
+
+    def update_metadata(self, metadata: Dict[str, Any]) -> None:
+        """Update the metadata stored with this checkpoint.
+
+        This will update any existing metadata stored with this checkpoint.
+        """
+        existing_metadata = self.get_metadata()
+        existing_metadata.update(metadata)
+        self.set_metadata(existing_metadata)
+
+    @classmethod
+    def from_directory(cls, path: Union[str, os.PathLike]) -> "Checkpoint":
+        """Create checkpoint object from a local directory.
+
+        Args:
+            path: Local directory containing checkpoint data.
+
+        Returns:
+            A ray.train.Checkpoint object.
+        """
+        return cls(path, filesystem=pyarrow.fs.LocalFileSystem())
+
+    def to_directory(self, path: Optional[Union[str, os.PathLike]] = None) -> str:
+        """Write checkpoint data to a local directory.
+
+        *If multiple processes on the same node call this method simultaneously,*
+        only a single process will perform the download, while the others
+        wait for the download to finish. Once the download finishes, all processes
+        receive the same local directory to read from.
+
+        Args:
+            path: Target directory to download data to. If not specified,
+                this method will use a temporary directory.
+
+        Returns:
+            str: Directory containing checkpoint data.
+        """
+        user_provided_path = path is not None
+        local_path = (
+            path if user_provided_path else self._get_temporary_checkpoint_dir()
+        )
+        local_path = os.path.normpath(os.path.expanduser(str(local_path)))
+        os.makedirs(local_path, exist_ok=True)
+
+        try:
+            # Timeout 0 means there will be only one attempt to acquire
+            # the file lock. If it cannot be acquired, throw a TimeoutError
+            with TempFileLock(local_path, timeout=0):
+                _download_from_fs_path(
+                    fs=self.filesystem, fs_path=self.path, local_path=local_path
+                )
+        except TimeoutError:
+            # if the directory is already locked, then wait but do not do anything.
+            with TempFileLock(local_path, timeout=-1):
+                pass
+            if not os.path.exists(local_path):
+                raise RuntimeError(
+                    f"Checkpoint directory {local_path} does not exist, "
+                    "even though it should have been created by "
+                    "another process. Please raise an issue on GitHub: "
+                    "https://github.com/ray-project/ray/issues"
+                )
+
+        return local_path
+
+    @contextlib.contextmanager
+    def as_directory(self) -> Iterator[str]:
+        """Returns checkpoint contents in a local directory as a context.
+
+        This function makes checkpoint data available as a directory while avoiding
+        unnecessary copies and left-over temporary data.
+
+        *If the checkpoint points to a local directory*, this method just returns the
+        local directory path without making a copy, and nothing will be cleaned up
+        after exiting the context.
+
+        *If the checkpoint points to a remote directory*, this method will download the
+        checkpoint to a local temporary directory and return the path
+        to the temporary directory.
+
+        *If multiple processes on the same node call this method simultaneously,*
+        only a single process will perform the download, while the others
+        wait for the download to finish. Once the download finishes, all processes
+        receive the same local (temporary) directory to read from.
+
+        Once all processes have finished working with the checkpoint,
+        the temporary directory is cleaned up.
+
+        Users should treat the returned checkpoint directory as read-only and avoid
+        changing any data within it, as it may be deleted when exiting the context.
+
+        Example:
+
+        .. testcode::
+            :hide:
+
+            from pathlib import Path
+            import tempfile
+
+            from ray.train import Checkpoint
+
+            temp_dir = tempfile.mkdtemp()
+            (Path(temp_dir) / "example.txt").write_text("example checkpoint data")
+            checkpoint = Checkpoint.from_directory(temp_dir)
+
+        .. testcode::
+
+            with checkpoint.as_directory() as checkpoint_dir:
+                # Do some read-only processing of files within checkpoint_dir
+                pass
+
+            # At this point, if a temporary directory was created, it will have
+            # been deleted.
+
+        """
+        if isinstance(self.filesystem, pyarrow.fs.LocalFileSystem):
+            yield self.path
+        else:
+            del_lock_path = _get_del_lock_path(self._get_temporary_checkpoint_dir())
+            open(del_lock_path, "a").close()
+
+            temp_dir = self.to_directory()
+            try:
+                yield temp_dir
+            finally:
+                # Always cleanup the del lock after we're done with the directory.
+                # This avoids leaving a lock file behind in the case of an exception
+                # in the user code.
+                try:
+                    os.remove(del_lock_path)
+                except Exception:
+                    logger.warning(
+                        f"Could not remove {del_lock_path} deletion file lock. "
+                        f"Traceback:\n{traceback.format_exc()}"
+                    )
+
+                # If there are no more lock files, that means there are no more
+                # readers of this directory, and we can safely delete it.
+                # In the edge case (process crash before del lock file is removed),
+                # we do not remove the directory at all.
+                # Since it's in /tmp, this is not that big of a deal.
+                # check if any lock files are remaining
+                remaining_locks = _list_existing_del_locks(temp_dir)
+                if not remaining_locks:
+                    try:
+                        # Timeout 0 means there will be only one attempt to acquire
+                        # the file lock. If it cannot be acquired, a TimeoutError
+                        # will be thrown.
+                        with TempFileLock(temp_dir, timeout=0):
+                            shutil.rmtree(temp_dir, ignore_errors=True)
+                    except TimeoutError:
+                        pass
+
+    def _get_temporary_checkpoint_dir(self) -> str:
+        """Return the name for the temporary checkpoint dir that this checkpoint
+        will get downloaded to, if accessing via `to_directory` or `as_directory`.
+        """
+        tmp_dir_path = tempfile.gettempdir()
+        checkpoint_dir_name = _CHECKPOINT_TEMP_DIR_PREFIX + self._uuid.hex
+        if platform.system() == "Windows":
+            # Max path on Windows is 260 chars, -1 for joining \
+            # Also leave a little for the del lock
+            del_lock_name = _get_del_lock_path("")
+            checkpoint_dir_name = (
+                _CHECKPOINT_TEMP_DIR_PREFIX
+                + self._uuid.hex[
+                    -259
+                    + len(_CHECKPOINT_TEMP_DIR_PREFIX)
+                    + len(tmp_dir_path)
+                    + len(del_lock_name) :
+                ]
+            )
+            if not checkpoint_dir_name.startswith(_CHECKPOINT_TEMP_DIR_PREFIX):
+                raise RuntimeError(
+                    "Couldn't create checkpoint directory due to length "
+                    "constraints. Try specifying a shorter checkpoint path."
+                )
+        return Path(tmp_dir_path, checkpoint_dir_name).as_posix()
+
+    def __fspath__(self):
+        raise TypeError(
+            "You cannot use `Checkpoint` objects directly as paths. "
+            "Use `Checkpoint.to_directory()` or `Checkpoint.as_directory()` instead."
+        )
+
+
+def _get_del_lock_path(path: str, suffix: str = None) -> str:
+    """Get the path to the deletion lock file for a file/directory at `path`.
+
+    Example:
+
+        >>> _get_del_lock_path("/tmp/checkpoint_tmp")  # doctest: +ELLIPSIS
+        '/tmp/checkpoint_tmp.del_lock_...
+        >>> _get_del_lock_path("/tmp/checkpoint_tmp/")  # doctest: +ELLIPSIS
+        '/tmp/checkpoint_tmp.del_lock_...
+        >>> _get_del_lock_path("/tmp/checkpoint_tmp.txt")  # doctest: +ELLIPSIS
+        '/tmp/checkpoint_tmp.txt.del_lock_...
+
+    """
+    suffix = suffix if suffix is not None else str(os.getpid())
+    return f"{path.rstrip('/')}.del_lock_{suffix}"
+
+
+def _list_existing_del_locks(path: str) -> List[str]:
+    """List all the deletion lock files for a file/directory at `path`.
+
+    For example, if 2 checkpoints are being read via `as_directory`,
+    then this should return a list of 2 deletion lock files.
+    """
+    return list(glob.glob(f"{_get_del_lock_path(path, suffix='*')}"))
+
+
+def _get_migration_error(name: str):
+    return AttributeError(
+        f"The new `ray.train.Checkpoint` class does not support `{name}()`. "
+        f"Instead, only directories are supported.\n\n"
+        f"Example to store a dictionary in a checkpoint:\n\n"
+        f"import os, tempfile\n"
+        f"import ray.cloudpickle as pickle\n"
+        f"from ray import train\n"
+        f"from ray.train import Checkpoint\n\n"
+        f"with tempfile.TemporaryDirectory() as checkpoint_dir:\n"
+        f"    with open(os.path.join(checkpoint_dir, 'data.pkl'), 'wb') as fp:\n"
+        f"        pickle.dump({{'data': 'value'}}, fp)\n\n"
+        f"    checkpoint = Checkpoint.from_directory(checkpoint_dir)\n"
+        f"    train.report(..., checkpoint=checkpoint)\n\n"
+        f"Example to load a dictionary from a checkpoint:\n\n"
+        f"if train.get_checkpoint():\n"
+        f"    with train.get_checkpoint().as_directory() as checkpoint_dir:\n"
+        f"        with open(os.path.join(checkpoint_dir, 'data.pkl'), 'rb') as fp:\n"
+        f"            data = pickle.load(fp)"
+    )
+
+
+def _get_uri_error(name: str):
+    return AttributeError(
+        f"The new `ray.train.Checkpoint` class does not support `{name}()`. "
+        f"To create a checkpoint from remote storage, create a `Checkpoint` using its "
+        f"constructor instead of `from_directory`.\n"
+        f'Example: `Checkpoint(path="s3://a/b/c")`.\n'
+        f"Then, access the contents of the checkpoint with "
+        f"`checkpoint.as_directory()` / `checkpoint.to_directory()`.\n"
+        f"To upload data to remote storage, use e.g. `pyarrow.fs.FileSystem` "
+        f"or your client of choice."
+    )
+
+
+def _get_preprocessor_error(name: str):
+    return AttributeError(
+        f"The new `ray.train.Checkpoint` class does not support `{name}()`. "
+        f"To include preprocessor information in checkpoints, "
+        f"pass it as metadata in the <Framework>Trainer constructor.\n"
+        f"Example: `TorchTrainer(..., metadata={{...}})`.\n"
+        f"After training, access it in the checkpoint via `checkpoint.get_metadata()`. "
+        f"See here: https://docs.ray.io/en/master/train/user-guides/"
+        f"data-loading-preprocessing.html#preprocessing-structured-data"
+    )
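Putting the `Checkpoint` methods above together, here is a small sketch of the round trip: wrap a local directory, attach metadata (persisted as `.metadata.json` next to the data), and read the contents back. The directory layout and metadata keys are made up for illustration.

```python
import os
import tempfile

from ray.train import Checkpoint

# Write some state into a local directory and wrap it in a Checkpoint.
checkpoint_dir = tempfile.mkdtemp()
with open(os.path.join(checkpoint_dir, "weights.txt"), "w") as f:
    f.write("fake-model-weights")

checkpoint = Checkpoint.from_directory(checkpoint_dir)

# Attach and then merge user metadata.
checkpoint.set_metadata({"epoch": 3})
checkpoint.update_metadata({"framework": "none"})
print(checkpoint.get_metadata())  # {'epoch': 3, 'framework': 'none'}

# Read the contents back. For a local checkpoint this yields the same
# directory without copying; for a remote one it downloads to a temp dir.
with checkpoint.as_directory() as local_dir:
    print(os.listdir(local_dir))
```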
.venv/lib/python3.11/site-packages/ray/train/backend.py
ADDED
@@ -0,0 +1,59 @@
+import logging
+from contextlib import nullcontext
+from typing import TypeVar
+
+from ray.train._internal.utils import Singleton
+from ray.train._internal.worker_group import WorkerGroup
+from ray.util.annotations import DeveloperAPI
+from ray.widgets import make_table_html_repr
+
+EncodedData = TypeVar("EncodedData")
+
+logger = logging.getLogger(__name__)
+
+
+@DeveloperAPI
+class BackendConfig:
+    """Parent class for configurations of training backend."""
+
+    @property
+    def backend_cls(self):
+        return Backend
+
+    @property
+    def train_func_context(self):
+        return nullcontext
+
+    def _repr_html_(self) -> str:
+        return make_table_html_repr(obj=self, title=type(self).__name__)
+
+
+@DeveloperAPI
+class Backend(metaclass=Singleton):
+    """Singleton for distributed communication backend.
+
+    Attributes:
+        share_cuda_visible_devices: If True, each worker
+            process will have CUDA_VISIBLE_DEVICES set as the visible device
+            IDs of all workers on the same node for this training instance.
+            If False, each worker will have CUDA_VISIBLE_DEVICES set to the
+            device IDs allocated by Ray for that worker.
+    """
+
+    share_cuda_visible_devices: bool = False
+
+    def on_start(self, worker_group: WorkerGroup, backend_config: BackendConfig):
+        """Logic for starting this backend."""
+        pass
+
+    def on_shutdown(self, worker_group: WorkerGroup, backend_config: BackendConfig):
+        """Logic for shutting down the backend."""
+        pass
+
+    def on_training_start(
+        self, worker_group: WorkerGroup, backend_config: BackendConfig
+    ):
+        """Logic ran right before training is started.
+
+        Session API is available at this point."""
+        pass
.venv/lib/python3.11/site-packages/ray/train/base_trainer.py
ADDED
@@ -0,0 +1,827 @@
+import abc
+import copy
+import inspect
+import json
+import logging
+import os
+import warnings
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union
+
+import pyarrow.fs
+
+import ray
+import ray.cloudpickle as pickle
+from ray._private.dict import deep_update
+from ray.air._internal import usage as air_usage
+from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated
+from ray.air._internal.usage import AirEntrypoint
+from ray.air.config import RunConfig, ScalingConfig
+from ray.air.result import Result
+from ray.train import Checkpoint
+from ray.train._internal.session import get_session
+from ray.train._internal.storage import (
+    StorageContext,
+    _exists_at_fs_path,
+    get_fs_and_path,
+)
+from ray.util import PublicAPI
+from ray.util.annotations import DeveloperAPI
+
+if TYPE_CHECKING:
+    from ray.data import Dataset
+    from ray.tune import Trainable
+
+_TRAINER_PKL = "trainer.pkl"
+
+# A type representing either a ray.data.Dataset or a function that returns a
+# ray.data.Dataset and accepts no arguments.
+GenDataset = Union["Dataset", Callable[[], "Dataset"]]
+
+
+logger = logging.getLogger(__name__)
+
+PREPROCESSOR_DEPRECATION_MESSAGE = (
+    "The `preprocessor` argument to Trainers is deprecated as of Ray 2.7. "
+    "Instead, use the Preprocessor `fit` and `transform` APIs directly on the Ray "
+    "Dataset. For any state that needs to be saved to the trained checkpoint, pass it "
+    "in using the `metadata` argument of the `Trainer`. "
+    "For a full example, see "
+    "https://docs.ray.io/en/master/train/user-guides/data-loading-preprocessing.html#preprocessing-structured-data "  # noqa:E501
+)
+
+
+@PublicAPI(stability="beta")
+class TrainingFailedError(RuntimeError):
+    """An error indicating that training has failed."""
+
+    _RESTORE_MSG = (
+        "The Ray Train run failed. Please inspect the previous error messages for a "
+        "cause. After fixing the issue (assuming that the error is not caused by "
+        "your own application logic, but rather an error such as OOM), you can restart "
+        "the run from scratch or continue this run.\n"
+        "To continue this run, you can use: "
+        '`trainer = {trainer_cls_name}.restore("{path}")`.'
+    )
+
+    _FAILURE_CONFIG_MSG = (
+        "To start a new run that will retry on training failures, set "
+        "`train.RunConfig(failure_config=train.FailureConfig(max_failures))` "
+        "in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` "
+        "for unlimited retries."
+    )
+
+
+def _train_coordinator_fn(
+    config: dict, trainer_cls: Type["BaseTrainer"], metadata: dict
+):
+    """This is the function that defines the logic of the Ray Train coordinator.
+    This is responsible for setting up a remote instance of the `trainer_cls`
+    (a different instance than the one calling `trainer.fit` on the driver!)
+    and running the training loop.
+    """
+    assert metadata is not None, metadata
+    # Propagate user metadata from the Trainer constructor.
+    get_session().metadata = metadata
+
+    # config already contains merged values.
+    # Instantiate new Trainer in Trainable.
+    trainer = trainer_cls(**config)
+
+    # Get the checkpoint from Tune and pass it to workers later on.
+    checkpoint = ray.train.get_checkpoint()
+    if checkpoint:
+        # Set `starting_checkpoint` for auto-recovery fault-tolerance
+        # as well as manual restoration.
+        trainer.starting_checkpoint = checkpoint
+    # else: Train will restore from the user-provided
+    # `resume_from_checkpoint` == `starting_checkpoint`.
+
+    # Evaluate datasets if they are wrapped in a factory.
+    trainer.datasets = {
+        k: d() if callable(d) else d for k, d in trainer.datasets.items()
+    }
+
+    trainer.setup()
+    trainer.training_loop()
+
+
+@DeveloperAPI
+class BaseTrainer(abc.ABC):
+    """Defines interface for distributed training on Ray.
+
+    Note: The base ``BaseTrainer`` class cannot be instantiated directly. Only
+    one of its subclasses can be used.
+
+    Note to developers: If a new trainer is added, please update
+    `air/_internal/usage.py`.
+
+    **How does a trainer work?**
+
+    - First, initialize the Trainer. The initialization runs locally,
+      so heavyweight setup should not be done in ``__init__``.
+    - Then, when you call ``trainer.fit()``, the Trainer is serialized
+      and copied to a remote Ray actor. The following methods are then
+      called in sequence on the remote actor.
+    - ``trainer.setup()``: Any heavyweight Trainer setup should be
+      specified here.
+    - ``trainer.training_loop()``: Executes the main training logic.
+    - Calling ``trainer.fit()`` will return a ``ray.result.Result``
+      object where you can access metrics from your training run, as well
+      as any checkpoints that may have been saved.
+
+    **How do I create a new Trainer?**
+
+    Subclass ``ray.train.trainer.BaseTrainer``, and override the ``training_loop``
+    method, and optionally ``setup``.
+
+    .. testcode::
+
+        import torch
+
+        from ray.train.trainer import BaseTrainer
+        from ray import train, tune
+
+
+        class MyPytorchTrainer(BaseTrainer):
+            def setup(self):
+                self.model = torch.nn.Linear(1, 1)
+                self.optimizer = torch.optim.SGD(
+                    self.model.parameters(), lr=0.1)
+
+            def training_loop(self):
+                # You can access any Trainer attributes directly in this method.
+                # self.datasets["train"] has already been
+                dataset = self.datasets["train"]
+
+                torch_ds = dataset.iter_torch_batches(dtypes=torch.float)
+                loss_fn = torch.nn.MSELoss()
+
+                for epoch_idx in range(10):
+                    loss = 0
+                    num_batches = 0
+                    torch_ds = dataset.iter_torch_batches(
+                        dtypes=torch.float, batch_size=2
+                    )
+                    for batch in torch_ds:
+                        X = torch.unsqueeze(batch["x"], 1)
+                        y = torch.unsqueeze(batch["y"], 1)
+                        # Compute prediction error
+                        pred = self.model(X)
+                        batch_loss = loss_fn(pred, y)
+
+                        # Backpropagation
+                        self.optimizer.zero_grad()
+                        batch_loss.backward()
+                        self.optimizer.step()
+
+                        loss += batch_loss.item()
+                        num_batches += 1
+                    loss /= num_batches
+
+                    # Use Tune functions to report intermediate
+                    # results.
+                    train.report({"loss": loss, "epoch": epoch_idx})
+
+
+        # Initialize the Trainer, and call Trainer.fit()
+        import ray
+        train_dataset = ray.data.from_items(
+            [{"x": i, "y": i} for i in range(10)])
+        my_trainer = MyPytorchTrainer(datasets={"train": train_dataset})
+        result = my_trainer.fit()
+
+    .. testoutput::
+        :hide:
+
+        ...
+
+    Args:
+        scaling_config: Configuration for how to scale training.
+        run_config: Configuration for the execution of the training run.
+        datasets: Any Datasets to use for training. Use the key "train"
+            to denote which dataset is the training dataset.
+        metadata: Dict that should be made available via
+            `train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
+            for checkpoints saved from this Trainer. Must be JSON-serializable.
+        resume_from_checkpoint: A checkpoint to resume training from.
+    """
+
+    _scaling_config_allowed_keys: List[str] = [
+        "trainer_resources",
+    ]
+    _handles_checkpoint_freq: bool = False
+    _handles_checkpoint_at_end: bool = False
+
+    # fields to propagate to Tuner param_space.
+    # See `BaseTrainer._extract_fields_for_tuner_param_space` for more details.
+    _fields_for_tuner_param_space = []
+
+    def __init__(
+        self,
+        *,
+        scaling_config: Optional[ScalingConfig] = None,
+        run_config: Optional[RunConfig] = None,
+        datasets: Optional[Dict[str, GenDataset]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        resume_from_checkpoint: Optional[Checkpoint] = None,
+    ):
+        self.scaling_config = (
+            scaling_config if scaling_config is not None else ScalingConfig()
+        )
+        self.run_config = (
+            copy.copy(run_config) if run_config is not None else RunConfig()
+        )
+        self.metadata = metadata
+        self.datasets = datasets if datasets is not None else {}
+        self.starting_checkpoint = resume_from_checkpoint
+
+        # These attributes should only be set through `BaseTrainer.restore`
+        self._restore_path = None
+        self._restore_storage_filesystem = None
+
+        self._validate_attributes()
+
+        air_usage.tag_air_trainer(self)
+
+    @PublicAPI(stability="alpha")
+    @classmethod
+    def restore(
+        cls: Type["BaseTrainer"],
+        path: Union[str, os.PathLike],
+        storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        datasets: Optional[Dict[str, GenDataset]] = None,
+        scaling_config: Optional[ScalingConfig] = None,
+        **kwargs,
+    ) -> "BaseTrainer":
+        """Restores a Train experiment from a previously interrupted/failed run.
+
+        Restore should be used for experiment-level fault tolerance in the event
+        that the head node crashes (e.g., OOM or some other runtime error) or the
+        entire cluster goes down (e.g., network error affecting all nodes).
+
+        A run that has already completed successfully will not be resumed from this API.
+        To continue training from a successful run, launch a new run with the
+        ``<Framework>Trainer(resume_from_checkpoint)`` API instead, passing in a
+        checkpoint from the previous run to start with.
+
+        .. note::
+
+            Restoring an experiment from a path that's pointing to a *different*
+            location than the original experiment path is supported. However, Ray Train
+            assumes that the full experiment directory is available
+            (including checkpoints) so that it's possible to resume trials from their
+            latest state.
+
+            For example, if the original experiment path was run locally, then the
+            results are uploaded to cloud storage, Ray Train expects the full contents
+            to be available in cloud storage if attempting to resume
+            via ``<Framework>Trainer.restore("s3://...")``. The restored run will
+            continue writing results to the same cloud storage location.
+
+        The following example can be paired with implementing job retry using
+        :ref:`Ray Jobs <jobs-overview>` to produce a Train experiment that will
+        attempt to resume on both experiment-level and trial-level failures:
+
+        .. testcode::
+
+            import os
+            import ray
+            from ray import train
+            from ray.train.trainer import BaseTrainer
+
+            experiment_name = "unique_experiment_name"
+            storage_path = os.path.expanduser("~/ray_results")
+            experiment_dir = os.path.join(storage_path, experiment_name)
+
+            # Define some dummy inputs for demonstration purposes
+            datasets = {"train": ray.data.from_items([{"a": i} for i in range(10)])}
+
+            class CustomTrainer(BaseTrainer):
+                def training_loop(self):
+                    pass
+
+            if CustomTrainer.can_restore(experiment_dir):
+                trainer = CustomTrainer.restore(
+                    experiment_dir, datasets=datasets
+                )
+            else:
+                trainer = CustomTrainer(
+                    datasets=datasets,
+                    run_config=train.RunConfig(
+                        name=experiment_name,
+                        storage_path=storage_path,
+                        # Tip: You can also enable retries on failure for
+                        # worker-level fault tolerance
+                        failure_config=train.FailureConfig(max_failures=3),
+                    ),
+                )
+
+            result = trainer.fit()
+
+        .. testoutput::
+            :hide:
+
+            ...
+
+        Args:
+            path: The path to the experiment directory of the training run to restore.
+                This can be a local path or a remote URI if the experiment was
+                uploaded to the cloud.
+            storage_filesystem: Custom ``pyarrow.fs.FileSystem``
+                corresponding to the ``path``. This may be necessary if the original
+                experiment passed in a custom filesystem.
+            datasets: Re-specified datasets used in the original training run.
+                This must include all the datasets that were passed in the
+                original trainer constructor.
+            scaling_config: Optionally re-specified scaling config. This can be
+                modified to be different from the original spec.
+            **kwargs: Other optionally re-specified arguments, passed in by subclasses.
+
+        Raises:
+            ValueError: If all datasets were not re-supplied on restore.
+
+        Returns:
+            BaseTrainer: A restored instance of the class that is calling this method.
+        """
+        if not cls.can_restore(path, storage_filesystem):
+            raise ValueError(
+                f"Invalid restore path: {path}. Make sure that this path exists and "
+                "is the experiment directory that results from a call to "
+                "`trainer.fit()`."
+            )
+        fs, fs_path = get_fs_and_path(path, storage_filesystem)
+        trainer_pkl_path = Path(fs_path, _TRAINER_PKL).as_posix()
+        with fs.open_input_file(trainer_pkl_path) as f:
+            trainer_cls, param_dict = pickle.loads(f.readall())
+
+        if trainer_cls is not cls:
+            warnings.warn(
+                f"Invalid trainer type. You are attempting to restore a trainer of type"
+                f" {trainer_cls} with `{cls.__name__}.restore`, "
+                "which will most likely fail. "
+                f"Use `{trainer_cls.__name__}.restore` instead."
+            )
+
+        original_datasets = param_dict.pop("datasets", {})
+        if original_datasets and not datasets:
+            raise ValueError(
+                "The following datasets need to be provided again on restore: "
+                f"{list(original_datasets.keys())}\n"
+                f"Use {cls.__name__}.restore(..., datasets=datasets) "
+                "with the datasets that were provided to the original trainer."
+            )
+        datasets = datasets or {}
+        if set(original_datasets) != set(datasets):
+            raise ValueError(
+                "The provided datasets don't match the original dataset keys.\n"
+                f"  Expected datasets for the keys: {list(original_datasets.keys())}\n"
+                f"  Actual datasets provided: {list(datasets.keys())}"
+            )
+        param_dict["datasets"] = datasets
+
+        if scaling_config:
+            param_dict["scaling_config"] = scaling_config
+
+        for param_name, val in kwargs.items():
+            # Overwrite the old value if something is passed into restore
+            if val is not None:
+                param_dict[param_name] = val
+
+        try:
+            trainer = cls(**param_dict)
+        except Exception as e:
+            raise ValueError(
+                "Trainer restoration failed (see above for the stack trace). "
+                "Make sure that you use the right trainer class to restore: "
+                f"`{cls.__name__}.restore`\n"
+            ) from e
+        trainer._restore_path = path
+        trainer._restore_storage_filesystem = storage_filesystem
+        return trainer
+
+    @PublicAPI(stability="alpha")
+    @classmethod
+    def can_restore(
+        cls: Type["BaseTrainer"],
+        path: Union[str, os.PathLike],
+        storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> bool:
+        """Checks whether a given directory contains a restorable Train experiment.
+
+        Args:
+            path: The path to the experiment directory of the Train experiment.
+                This can be either a local directory (e.g., ~/ray_results/exp_name)
+                or a remote URI (e.g., s3://bucket/exp_name).
+
+        Returns:
+            bool: Whether this path exists and contains the trainer state to resume from
+        """
+        fs, fs_path = get_fs_and_path(path, storage_filesystem)
+        trainer_pkl_path = Path(fs_path, _TRAINER_PKL).as_posix()
+        return _exists_at_fs_path(fs, trainer_pkl_path)
+
+    def __repr__(self):
+        # A dictionary that maps parameters to their default values.
+        default_values: Dict[str, Any] = {
+            "scaling_config": ScalingConfig(),
+            "run_config": RunConfig(),
+            "datasets": {},
+            "starting_checkpoint": None,
+        }
+
+        non_default_arguments = []
+        for parameter, default_value in default_values.items():
+            value = getattr(self, parameter)
+            if value != default_value:
+                non_default_arguments.append(f"{parameter}={value!r}")
+
+        if non_default_arguments:
+            return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>"
+
+        return f"<{self.__class__.__name__}>"
+
+    def __new__(cls, *args, **kwargs):
+        # Store the init args as attributes so this can be merged with Tune hparams.
+        trainer = super(BaseTrainer, cls).__new__(cls)
+        parameters = inspect.signature(cls.__init__).parameters
+        parameters = list(parameters.keys())
+        # Remove self.
+        parameters = parameters[1:]
+        arg_dict = dict(zip(parameters, args))
+        trainer._param_dict = {**arg_dict, **kwargs}
+        return trainer
+
+    def _validate_attributes(self):
+        """Called on __init()__ to validate trainer attributes."""
+        # Run config
+        if not isinstance(self.run_config, RunConfig):
+            raise ValueError(
+                f"`run_config` should be an instance of `ray.train.RunConfig`, "
+                f"found {type(self.run_config)} with value `{self.run_config}`."
+            )
+        # Scaling config
+        if not isinstance(self.scaling_config, ScalingConfig):
+            raise ValueError(
+                "`scaling_config` should be an instance of `ScalingConfig`, "
+                f"found {type(self.scaling_config)} with value `{self.scaling_config}`."
+            )
+        # Datasets
+        if not isinstance(self.datasets, dict):
+            raise ValueError(
+                f"`datasets` should be a dict mapping from a string to "
+                f"`ray.data.Dataset` objects, "
+                f"found {type(self.datasets)} with value `{self.datasets}`."
+            )
+        else:
+            for key, dataset in self.datasets.items():
+                if not isinstance(dataset, ray.data.Dataset) and not callable(dataset):
+                    raise ValueError(
+                        f"The Dataset under '{key}' key is not a "
+                        "`ray.data.Dataset`. "
+                        f"Received {dataset} instead."
+                    )
+        # Metadata.
+        self.metadata = self.metadata or {}
+        if not isinstance(self.metadata, dict):
+            raise TypeError(
+                f"The provided metadata must be a dict, was {type(self.metadata)}."
+            )
+        try:
+            self.metadata = json.loads(json.dumps(self.metadata))
+        except Exception as e:
+            raise ValueError(
+                "The provided metadata must be JSON-serializable: "
+                f"{self.metadata}: {e}"
+            )
+
+        if self.starting_checkpoint is not None and not isinstance(
+            self.starting_checkpoint, Checkpoint
+        ):
+            raise ValueError(
+                f"`resume_from_checkpoint` should be an instance of "
+                f"`ray.train.Checkpoint`, found {type(self.starting_checkpoint)} "
+                f"with value `{self.starting_checkpoint}`."
+            )
+
+    @classmethod
+    def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig:
+        """Returns scaling config dataclass after validating updated keys."""
+        ensure_only_allowed_dataclass_keys_updated(
+            dataclass=scaling_config,
+            allowed_keys=cls._scaling_config_allowed_keys,
+        )
+        return scaling_config
+
+    def setup(self) -> None:
+        """Called during fit() to perform initial setup on the Trainer.
+
+        .. note:: This method is run on a remote process.
+
+        This method will not be called on the driver, so any expensive setup
+        operations should be placed here and not in ``__init__``.
+
+        This method is called prior to ``preprocess_datasets`` and
|
| 526 |
+
``training_loop``.
|
| 527 |
+
"""
|
| 528 |
+
pass
|
| 529 |
+
|
| 530 |
+
def preprocess_datasets(self) -> None:
|
| 531 |
+
"""Deprecated."""
|
| 532 |
+
raise DeprecationWarning(
|
| 533 |
+
"`preprocess_datasets` is no longer used, since preprocessors "
|
| 534 |
+
f"are no longer accepted by Trainers.\n{PREPROCESSOR_DEPRECATION_MESSAGE}"
|
| 535 |
+
)
|
| 536 |
+
|
| 537 |
+
@abc.abstractmethod
|
| 538 |
+
def training_loop(self) -> None:
|
| 539 |
+
"""Loop called by fit() to run training and report results to Tune.
|
| 540 |
+
|
| 541 |
+
.. note:: This method runs on a remote process.
|
| 542 |
+
|
| 543 |
+
``self.datasets`` have already been evaluated if they were wrapped in a factory.
|
| 544 |
+
|
| 545 |
+
You can use the :ref:`Ray Train utilities <train-loop-api>`
|
| 546 |
+
(:func:`train.report() <ray.train.report>` and
|
| 547 |
+
:func:`train.get_checkpoint() <ray.train.get_checkpoint>`) inside
|
| 548 |
+
this training loop.
|
| 549 |
+
|
| 550 |
+
Example:
|
| 551 |
+
|
| 552 |
+
.. testcode::
|
| 553 |
+
|
| 554 |
+
from ray.train.trainer import BaseTrainer
|
| 555 |
+
from ray import train
|
| 556 |
+
|
| 557 |
+
class MyTrainer(BaseTrainer):
|
| 558 |
+
def training_loop(self):
|
| 559 |
+
for epoch_idx in range(5):
|
| 560 |
+
...
|
| 561 |
+
train.report({"epoch": epoch_idx})
|
| 562 |
+
|
| 563 |
+
"""
|
| 564 |
+
raise NotImplementedError
|
| 565 |
+
|
| 566 |
+
@PublicAPI(stability="beta")
|
| 567 |
+
def fit(self) -> Result:
|
| 568 |
+
"""Runs training.
|
| 569 |
+
|
| 570 |
+
Returns:
|
| 571 |
+
A Result object containing the training result.
|
| 572 |
+
|
| 573 |
+
Raises:
|
| 574 |
+
TrainingFailedError: If any failures during the execution
|
| 575 |
+
of ``self.as_trainable()``, or during the Tune execution loop.
|
| 576 |
+
"""
|
| 577 |
+
from ray.tune import ResumeConfig, TuneError
|
| 578 |
+
from ray.tune.tuner import Tuner
|
| 579 |
+
|
| 580 |
+
trainable = self.as_trainable()
|
| 581 |
+
param_space = self._extract_fields_for_tuner_param_space()
|
| 582 |
+
|
| 583 |
+
self.run_config.name = (
|
| 584 |
+
self.run_config.name or StorageContext.get_experiment_dir_name(trainable)
|
| 585 |
+
)
|
| 586 |
+
# The storage context here is only used to access the resolved
|
| 587 |
+
# storage fs and experiment path, in order to avoid duplicating that logic.
|
| 588 |
+
# This is NOT the storage context object that gets passed to remote workers.
|
| 589 |
+
storage = StorageContext(
|
| 590 |
+
storage_path=self.run_config.storage_path,
|
| 591 |
+
experiment_dir_name=self.run_config.name,
|
| 592 |
+
storage_filesystem=self.run_config.storage_filesystem,
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
if self._restore_path:
|
| 596 |
+
tuner = Tuner.restore(
|
| 597 |
+
path=self._restore_path,
|
| 598 |
+
trainable=trainable,
|
| 599 |
+
param_space=param_space,
|
| 600 |
+
_resume_config=ResumeConfig(
|
| 601 |
+
finished=ResumeConfig.ResumeType.RESUME,
|
| 602 |
+
unfinished=ResumeConfig.ResumeType.RESUME,
|
| 603 |
+
errored=ResumeConfig.ResumeType.RESUME,
|
| 604 |
+
),
|
| 605 |
+
storage_filesystem=self._restore_storage_filesystem,
|
| 606 |
+
)
|
| 607 |
+
else:
|
| 608 |
+
tuner = Tuner(
|
| 609 |
+
trainable=trainable,
|
| 610 |
+
param_space=param_space,
|
| 611 |
+
run_config=self.run_config,
|
| 612 |
+
_entrypoint=AirEntrypoint.TRAINER,
|
| 613 |
+
)
|
| 614 |
+
|
| 615 |
+
self._save(storage.storage_filesystem, storage.experiment_fs_path)
|
| 616 |
+
|
| 617 |
+
restore_msg = TrainingFailedError._RESTORE_MSG.format(
|
| 618 |
+
trainer_cls_name=self.__class__.__name__,
|
| 619 |
+
path=str(storage.experiment_fs_path),
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
try:
|
| 623 |
+
result_grid = tuner.fit()
|
| 624 |
+
except TuneError as e:
|
| 625 |
+
# Catch any `TuneError`s raised by the `Tuner.fit` call.
|
| 626 |
+
# Unwrap the `TuneError` if needed.
|
| 627 |
+
parent_error = e.__cause__ or e
|
| 628 |
+
|
| 629 |
+
# Raise it to the user as a `TrainingFailedError` with a message to restore.
|
| 630 |
+
raise TrainingFailedError(restore_msg) from parent_error
|
| 631 |
+
# Other exceptions get passed through directly (ex: on `fail_fast='raise'`)
|
| 632 |
+
|
| 633 |
+
assert len(result_grid) == 1
|
| 634 |
+
result = result_grid[0]
|
| 635 |
+
if result.error:
|
| 636 |
+
# Raise trainable errors to the user with a message to restore
|
| 637 |
+
# or configure `FailureConfig` in a new run.
|
| 638 |
+
raise TrainingFailedError(
|
| 639 |
+
"\n".join([restore_msg, TrainingFailedError._FAILURE_CONFIG_MSG])
|
| 640 |
+
) from result.error
|
| 641 |
+
return result
|
| 642 |
+
|
| 643 |
+
def _save(self, fs: pyarrow.fs.FileSystem, experiment_path: str):
|
| 644 |
+
"""Saves the current trainer's class along with the `param_dict` of
|
| 645 |
+
parameters passed to this trainer's constructor.
|
| 646 |
+
|
| 647 |
+
This is used to recreate the trainer on restore.
|
| 648 |
+
Unless a parameter is re-specified during restoration (only a subset
|
| 649 |
+
of parameters can be passed in again), that parameter will be loaded
|
| 650 |
+
from the saved copy.
|
| 651 |
+
|
| 652 |
+
Datasets should not be saved as part of the state. Instead, we save the
|
| 653 |
+
keys and replace the dataset values with dummy functions that will
|
| 654 |
+
raise an error if invoked. The error only serves as a guardrail for
|
| 655 |
+
misuse (e.g., manually unpickling and constructing the Trainer again)
|
| 656 |
+
and is not typically surfaced, since datasets must be re-specified
|
| 657 |
+
upon restoration.
|
| 658 |
+
"""
|
| 659 |
+
param_dict = self._param_dict.copy()
|
| 660 |
+
datasets = param_dict.pop("datasets", {})
|
| 661 |
+
|
| 662 |
+
def raise_fn():
|
| 663 |
+
raise RuntimeError
|
| 664 |
+
|
| 665 |
+
if datasets:
|
| 666 |
+
param_dict["datasets"] = {
|
| 667 |
+
dataset_name: raise_fn for dataset_name in datasets
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
cls_and_param_dict = (self.__class__, param_dict)
|
| 671 |
+
|
| 672 |
+
fs.create_dir(experiment_path)
|
| 673 |
+
with fs.open_output_stream(Path(experiment_path, _TRAINER_PKL).as_posix()) as f:
|
| 674 |
+
f.write(pickle.dumps(cls_and_param_dict))
|
| 675 |
+
|
| 676 |
+
def _extract_fields_for_tuner_param_space(self) -> Dict:
|
| 677 |
+
"""Extracts fields to be included in `Tuner.param_space`.
|
| 678 |
+
|
| 679 |
+
This is needed to leverage the full logging/integration offerings from Tune.
|
| 680 |
+
For example, `param_space` is logged automatically to wandb integration.
|
| 681 |
+
|
| 682 |
+
Currently only done for `train_loop_config`.
|
| 683 |
+
|
| 684 |
+
Returns:
|
| 685 |
+
A dictionary that should be passed to Tuner.param_space.
|
| 686 |
+
"""
|
| 687 |
+
result = {}
|
| 688 |
+
for key in self._fields_for_tuner_param_space:
|
| 689 |
+
if key in self._param_dict.keys():
|
| 690 |
+
result[key] = copy.deepcopy(self._param_dict[key])
|
| 691 |
+
return result
|
| 692 |
+
|
| 693 |
+
def _generate_trainable_cls(self) -> Type["Trainable"]:
|
| 694 |
+
"""Generates the base Trainable class.
|
| 695 |
+
|
| 696 |
+
Returns:
|
| 697 |
+
A Trainable class to use for training.
|
| 698 |
+
"""
|
| 699 |
+
|
| 700 |
+
from ray.tune.execution.placement_groups import PlacementGroupFactory
|
| 701 |
+
from ray.tune.trainable import wrap_function
|
| 702 |
+
|
| 703 |
+
trainer_cls = self.__class__
|
| 704 |
+
scaling_config = self.scaling_config
|
| 705 |
+
metadata = self.metadata
|
| 706 |
+
|
| 707 |
+
train_coordinator_fn = partial(
|
| 708 |
+
_train_coordinator_fn, trainer_cls=trainer_cls, metadata=metadata
|
| 709 |
+
)
|
| 710 |
+
# Change the name of the training function to match the name of the Trainer
|
| 711 |
+
# class. This will mean the Tune trial name will match the name of Trainer on
|
| 712 |
+
# stdout messages and the results directory.
|
| 713 |
+
train_coordinator_fn.__name__ = trainer_cls.__name__
|
| 714 |
+
|
| 715 |
+
trainable_cls = wrap_function(train_coordinator_fn)
|
| 716 |
+
has_base_dataset = bool(self.datasets)
|
| 717 |
+
if has_base_dataset:
|
| 718 |
+
from ray.data.context import DataContext
|
| 719 |
+
|
| 720 |
+
dataset_context = DataContext.get_current()
|
| 721 |
+
else:
|
| 722 |
+
dataset_context = None
|
| 723 |
+
|
| 724 |
+
class TrainTrainable(trainable_cls):
|
| 725 |
+
"""Adds default resources to the Trainable."""
|
| 726 |
+
|
| 727 |
+
_handles_checkpoint_freq = trainer_cls._handles_checkpoint_freq
|
| 728 |
+
_handles_checkpoint_at_end = trainer_cls._handles_checkpoint_at_end
|
| 729 |
+
|
| 730 |
+
@classmethod
|
| 731 |
+
def has_base_dataset(cls) -> bool:
|
| 732 |
+
"""Whether a dataset is provided through the Trainer."""
|
| 733 |
+
return has_base_dataset
|
| 734 |
+
|
| 735 |
+
@classmethod
|
| 736 |
+
def base_scaling_config(cls) -> ScalingConfig:
|
| 737 |
+
"""Returns the unchanged scaling config provided through the Trainer."""
|
| 738 |
+
return scaling_config
|
| 739 |
+
|
| 740 |
+
def setup(self, config, **kwargs):
|
| 741 |
+
base_config = dict(kwargs)
|
| 742 |
+
# Merge Tuner param space hyperparameters in `config` into the
|
| 743 |
+
# base config passed to the Trainer constructor, which is `base_config`.
|
| 744 |
+
# `base_config` is pulled from the object store from the usage of
|
| 745 |
+
# tune.with_parameters in `BaseTrainer.as_trainable`.
|
| 746 |
+
|
| 747 |
+
# run_config is not a tunable hyperparameter so it does not need to be
|
| 748 |
+
# merged.
|
| 749 |
+
run_config = base_config.pop("run_config", None)
|
| 750 |
+
self._merged_config = deep_update(
|
| 751 |
+
base_config, self.config, new_keys_allowed=True
|
| 752 |
+
)
|
| 753 |
+
self._merged_config["run_config"] = run_config
|
| 754 |
+
merged_scaling_config = self._merged_config.get(
|
| 755 |
+
"scaling_config", ScalingConfig()
|
| 756 |
+
)
|
| 757 |
+
if isinstance(merged_scaling_config, dict):
|
| 758 |
+
merged_scaling_config = ScalingConfig(**merged_scaling_config)
|
| 759 |
+
self._merged_config[
|
| 760 |
+
"scaling_config"
|
| 761 |
+
] = self._reconcile_scaling_config_with_trial_resources(
|
| 762 |
+
merged_scaling_config
|
| 763 |
+
)
|
| 764 |
+
if self.has_base_dataset():
|
| 765 |
+
# Set the DataContext on the Trainer actor to the DataContext
|
| 766 |
+
# specified on the driver.
|
| 767 |
+
DataContext._set_current(dataset_context)
|
| 768 |
+
super(TrainTrainable, self).setup(config)
|
| 769 |
+
|
| 770 |
+
def _reconcile_scaling_config_with_trial_resources(
|
| 771 |
+
self, scaling_config: ScalingConfig
|
| 772 |
+
) -> ScalingConfig:
|
| 773 |
+
"""
|
| 774 |
+
ResourceChangingScheduler workaround.
|
| 775 |
+
|
| 776 |
+
Ensures that the scaling config matches trial resources.
|
| 777 |
+
|
| 778 |
+
This should be replaced with RCS returning a ScalingConfig
|
| 779 |
+
in the future.
|
| 780 |
+
"""
|
| 781 |
+
|
| 782 |
+
trial_resources = self.trial_resources
|
| 783 |
+
# This will be false if the resources are default
|
| 784 |
+
if not isinstance(trial_resources, PlacementGroupFactory):
|
| 785 |
+
return scaling_config
|
| 786 |
+
|
| 787 |
+
# Ignore ResourceChangingScheduler workaround when resource bundles
|
| 788 |
+
# are unchanged
|
| 789 |
+
if self.trial_resources == scaling_config.as_placement_group_factory():
|
| 790 |
+
return scaling_config
|
| 791 |
+
|
| 792 |
+
trainer_cls._validate_scaling_config(scaling_config)
|
| 793 |
+
|
| 794 |
+
return ScalingConfig.from_placement_group_factory(trial_resources)
|
| 795 |
+
|
| 796 |
+
def _trainable_func(self, config):
|
| 797 |
+
# We ignore the config passed by Tune and instead use the merged
|
| 798 |
+
# config which includes the initial Trainer args.
|
| 799 |
+
super()._trainable_func(self._merged_config)
|
| 800 |
+
|
| 801 |
+
@classmethod
|
| 802 |
+
def default_resource_request(cls, config):
|
| 803 |
+
# `config["scaling_config"] is a dataclass when passed via the
|
| 804 |
+
# `scaling_config` argument in `Trainer` and is a dict when passed
|
| 805 |
+
# via the `scaling_config` key of `param_spec`.
|
| 806 |
+
|
| 807 |
+
# Conversion logic must be duplicated in `TrainTrainable.__init__`
|
| 808 |
+
# because this is a class method.
|
| 809 |
+
updated_scaling_config = config.get("scaling_config", scaling_config)
|
| 810 |
+
if isinstance(updated_scaling_config, dict):
|
| 811 |
+
updated_scaling_config = ScalingConfig(**updated_scaling_config)
|
| 812 |
+
validated_scaling_config = trainer_cls._validate_scaling_config(
|
| 813 |
+
updated_scaling_config
|
| 814 |
+
)
|
| 815 |
+
return validated_scaling_config.as_placement_group_factory()
|
| 816 |
+
|
| 817 |
+
return TrainTrainable
|
| 818 |
+
|
| 819 |
+
def as_trainable(self) -> Type["Trainable"]:
|
| 820 |
+
"""Converts self to a ``tune.Trainable`` class."""
|
| 821 |
+
from ray import tune
|
| 822 |
+
|
| 823 |
+
base_config = self._param_dict
|
| 824 |
+
trainable_cls = self._generate_trainable_cls()
|
| 825 |
+
|
| 826 |
+
# Wrap with `tune.with_parameters` to handle very large values in base_config
|
| 827 |
+
return tune.with_parameters(trainable_cls, **base_config)
|
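A minimal usage sketch of the restore flow implemented in `base_trainer.py` above (illustrative only; the experiment path and the `TorchTrainer` subclass are assumptions, not part of this file). Because `_save` replaces dataset values with placeholder functions, datasets must be re-supplied on restore:

    import ray
    from ray.train.torch import TorchTrainer

    experiment_path = "~/ray_results/my_experiment"  # hypothetical path
    train_dataset = ray.data.from_items([1, 2, 3])   # the dataset originally passed to the trainer

    # `can_restore` checks for the pickled trainer state; `restore` rebuilds the
    # trainer from it, with datasets passed in again explicitly.
    if TorchTrainer.can_restore(experiment_path):
        trainer = TorchTrainer.restore(experiment_path, datasets={"train": train_dataset})
        result = trainer.fit()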
.venv/lib/python3.11/site-packages/ray/train/constants.py
ADDED
@@ -0,0 +1,118 @@
from pathlib import Path

import ray
from ray._private.ray_constants import env_bool
from ray.air.constants import (  # noqa: F401
    COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV,
    EVALUATION_DATASET_KEY,
    MODEL_KEY,
    PREPROCESSOR_KEY,
    TRAIN_DATASET_KEY,
)


def _get_ray_train_session_dir() -> str:
    assert ray.is_initialized(), "Ray must be initialized to get the session dir."
    return Path(
        ray._private.worker._global_node.get_session_dir_path(), "artifacts"
    ).as_posix()


DEFAULT_STORAGE_PATH = Path("~/ray_results").expanduser().as_posix()

# Autofilled ray.train.report() metrics. Keys should be consistent with Tune.
CHECKPOINT_DIR_NAME = "checkpoint_dir_name"
TIME_TOTAL_S = "_time_total_s"
WORKER_HOSTNAME = "_hostname"
WORKER_NODE_IP = "_node_ip"
WORKER_PID = "_pid"

# Only reported if the ENABLE_DETAILED_AUTOFILLED_METRICS_ENV
# env var is set to a non-zero value.
DETAILED_AUTOFILLED_KEYS = {WORKER_HOSTNAME, WORKER_NODE_IP, WORKER_PID, TIME_TOTAL_S}

# Default filename for JSON logger
RESULT_FILE_JSON = "results.json"

# The name of the subdirectory inside the trainer run_dir to store checkpoints.
TRAIN_CHECKPOINT_SUBDIR = "checkpoints"

# The key to use to specify the checkpoint id for Tune.
# This needs to be added to the checkpoint dictionary so if the Tune trial
# is restarted, the checkpoint_id can continue to increment.
TUNE_CHECKPOINT_ID = "_current_checkpoint_id"

# Deprecated configs can use this value to detect if the user has set it.
_DEPRECATED_VALUE = "DEPRECATED"

# ==================================================
# Environment Variables
# ==================================================

ENABLE_DETAILED_AUTOFILLED_METRICS_ENV = (
    "TRAIN_RESULT_ENABLE_DETAILED_AUTOFILLED_METRICS"
)

# Integer value which if set will override the value of
# Backend.share_cuda_visible_devices. 1 for True, 0 for False.
ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES"

# Integer value which if set will not share ROCR accelerator visible devices
# across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCR_VISIBLE_DEVICES"

# Integer value which if set will not share neuron-core accelerator visible cores
# across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = (
    "TRAIN_ENABLE_SHARE_NEURON_CORES_ACCELERATOR"
)

# Integer value which if set will not share npu visible devices
# across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ASCEND_RT_VISIBLE_DEVICES"

# Integer value which indicates the number of seconds to wait when creating
# the worker placement group before timing out.
TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV = "TRAIN_PLACEMENT_GROUP_TIMEOUT_S"

# Integer value which if set will change the placement group strategy from
# PACK to SPREAD. 1 for True, 0 for False.
TRAIN_ENABLE_WORKER_SPREAD_ENV = "TRAIN_ENABLE_WORKER_SPREAD"

# Set this to 0 to disable changing the working directory of each Tune Trainable
# or Train worker to the trial directory. Defaults to 1.
RAY_CHDIR_TO_TRIAL_DIR = "RAY_CHDIR_TO_TRIAL_DIR"

# Set this to 1 to count preemption errors toward `FailureConfig(max_failures)`.
# Defaults to 0, which always retries on node preemption failures.
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE = "RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE"

# Set this to 1 to start a StateActor and collect information about Train Runs.
# Defaults to 0.
RAY_TRAIN_ENABLE_STATE_TRACKING = "RAY_TRAIN_ENABLE_STATE_TRACKING"

# Set this to 1 to enable deprecation warnings for V2 migration.
ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR = "RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS"


def _v2_migration_warnings_enabled() -> bool:
    return env_bool(ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR, False)


# NOTE: When adding a new environment variable, please track it in this list.
TRAIN_ENV_VARS = {
    ENABLE_DETAILED_AUTOFILLED_METRICS_ENV,
    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
    TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
    TRAIN_ENABLE_WORKER_SPREAD_ENV,
    RAY_CHDIR_TO_TRIAL_DIR,
    RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
    RAY_TRAIN_ENABLE_STATE_TRACKING,
}

# Key for AIR Checkpoint metadata in TrainingResult metadata
CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

# Key for AIR Checkpoint world rank in TrainingResult metadata
CHECKPOINT_RANK_KEY = "checkpoint_rank"
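The names above are plain environment-variable strings that Ray Train reads with helpers such as `env_bool` (as `_v2_migration_warnings_enabled` does) or `env_integer` (as `data_parallel_trainer.py` below does). A small illustrative sketch, with the value "1" chosen purely as an example:

    import os

    from ray._private.ray_constants import env_integer
    from ray.train.constants import RAY_TRAIN_ENABLE_STATE_TRACKING

    # Opt in to Train run state tracking for this driver process (illustrative value).
    os.environ[RAY_TRAIN_ENABLE_STATE_TRACKING] = "1"
    assert env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0) == 1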
.venv/lib/python3.11/site-packages/ray/train/context.py
ADDED
@@ -0,0 +1,139 @@
import threading
from typing import TYPE_CHECKING, Any, Dict, Optional

from ray.train._internal import session
from ray.train._internal.storage import StorageContext
from ray.train.constants import _v2_migration_warnings_enabled
from ray.train.utils import _copy_doc, _log_deprecation_warning
from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI

if TYPE_CHECKING:
    from ray.tune.execution.placement_groups import PlacementGroupFactory


# The context singleton on this process.
_default_context: "Optional[TrainContext]" = None
_context_lock = threading.Lock()


_GET_METADATA_DEPRECATION_MESSAGE = (
    "`get_metadata` was an experimental API that accessed the metadata passed "
    "to `<Framework>Trainer(metadata=...)`. This API can be replaced by passing "
    "the metadata directly to the training function (e.g., via `train_loop_config`)."
)

_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE = (
    "`{}` is deprecated because the concept of a `Trial` will "
    "soon be removed in Ray Train (see here: "
    "https://github.com/ray-project/enhancements/pull/57). "
    "Ray Train will no longer assume that it's running within a Ray Tune `Trial` "
    "in the future."
)


@PublicAPI(stability="stable")
class TrainContext:
    """Context containing metadata that can be accessed within Ray Train workers."""

    @_copy_doc(session.get_experiment_name)
    def get_experiment_name(self) -> str:
        return session.get_experiment_name()

    @_copy_doc(session.get_world_size)
    def get_world_size(self) -> int:
        return session.get_world_size()

    @_copy_doc(session.get_world_rank)
    def get_world_rank(self) -> int:
        return session.get_world_rank()

    @_copy_doc(session.get_local_rank)
    def get_local_rank(self) -> int:
        return session.get_local_rank()

    @_copy_doc(session.get_local_world_size)
    def get_local_world_size(self) -> int:
        return session.get_local_world_size()

    @_copy_doc(session.get_node_rank)
    def get_node_rank(self) -> int:
        return session.get_node_rank()

    @DeveloperAPI
    @_copy_doc(session.get_storage)
    def get_storage(self) -> StorageContext:
        return session.get_storage()

    # Deprecated APIs

    @Deprecated(
        message=_GET_METADATA_DEPRECATION_MESSAGE,
        warning=_v2_migration_warnings_enabled(),
    )
    @_copy_doc(session.get_metadata)
    def get_metadata(self) -> Dict[str, Any]:
        return session.get_metadata()

    @Deprecated(
        message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_name"),
        warning=_v2_migration_warnings_enabled(),
    )
    @_copy_doc(session.get_trial_name)
    def get_trial_name(self) -> str:
        return session.get_trial_name()

    @Deprecated(
        message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_id"),
        warning=_v2_migration_warnings_enabled(),
    )
    @_copy_doc(session.get_trial_id)
    def get_trial_id(self) -> str:
        return session.get_trial_id()

    @Deprecated(
        message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format(
            "get_trial_resources"
        ),
        warning=_v2_migration_warnings_enabled(),
    )
    @_copy_doc(session.get_trial_resources)
    def get_trial_resources(self) -> "PlacementGroupFactory":
        return session.get_trial_resources()

    @Deprecated(
        message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_dir"),
        warning=_v2_migration_warnings_enabled(),
    )
    @_copy_doc(session.get_trial_dir)
    def get_trial_dir(self) -> str:
        return session.get_trial_dir()


@PublicAPI(stability="stable")
def get_context() -> TrainContext:
    """Get or create a singleton training context.

    The context is only available within a function passed to Ray Train.

    See the :class:`~ray.train.TrainContext` API reference to see available methods.
    """
    from ray.tune.trainable.trainable_fn_utils import _in_tune_session

    # If we are running in a Tune function, switch to Tune context.
    if _in_tune_session():
        from ray.tune import get_context as get_tune_context

        if _v2_migration_warnings_enabled():
            _log_deprecation_warning(
                "`ray.train.get_context()` should be switched to "
                "`ray.tune.get_context()` when running in a function "
                "passed to Ray Tune. This will be an error in the future."
            )
        return get_tune_context()

    global _default_context

    with _context_lock:
        if _default_context is None:
            _default_context = TrainContext()
        return _default_context
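A brief sketch of how the `get_context()` entrypoint above is used from inside a training function (illustrative only; the worker function is an assumption, and it relies solely on the accessors defined in this file):

    from ray import train

    def train_loop_per_worker():
        ctx = train.get_context()
        # Rank and world-size metadata are only available inside a Ray Train worker.
        print(ctx.get_world_rank(), ctx.get_local_rank(), ctx.get_world_size())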
.venv/lib/python3.11/site-packages/ray/train/data_parallel_trainer.py
ADDED
|
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Any, Callable, Dict, List, Optional, Type, Union
|
| 4 |
+
|
| 5 |
+
import ray
|
| 6 |
+
from ray._private.ray_constants import env_integer
|
| 7 |
+
from ray._private.thirdparty.tabulate.tabulate import tabulate
|
| 8 |
+
from ray.air.config import RunConfig, ScalingConfig
|
| 9 |
+
from ray.train import BackendConfig, Checkpoint, TrainingIterator
|
| 10 |
+
from ray.train._internal import session
|
| 11 |
+
from ray.train._internal.backend_executor import BackendExecutor, TrialInfo
|
| 12 |
+
from ray.train._internal.data_config import DataConfig
|
| 13 |
+
from ray.train._internal.session import _TrainingResult, get_session
|
| 14 |
+
from ray.train._internal.utils import construct_train_func, count_required_parameters
|
| 15 |
+
from ray.train.constants import RAY_TRAIN_ENABLE_STATE_TRACKING
|
| 16 |
+
from ray.train.trainer import BaseTrainer, GenDataset
|
| 17 |
+
from ray.util.annotations import DeveloperAPI, PublicAPI
|
| 18 |
+
from ray.widgets import Template
|
| 19 |
+
from ray.widgets.util import repr_with_fallback
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@DeveloperAPI
|
| 25 |
+
class DataParallelTrainer(BaseTrainer):
|
| 26 |
+
"""A Trainer for data parallel training.
|
| 27 |
+
|
| 28 |
+
You should subclass this Trainer if your Trainer follows SPMD (single program,
|
| 29 |
+
multiple data) programming paradigm - you want multiple processes to run the same
|
| 30 |
+
function, but on different data.
|
| 31 |
+
|
| 32 |
+
This Trainer runs the function ``train_loop_per_worker`` on multiple Ray
|
| 33 |
+
Actors.
|
| 34 |
+
|
| 35 |
+
The ``train_loop_per_worker`` function is expected to take in either 0 or 1
|
| 36 |
+
arguments:
|
| 37 |
+
|
| 38 |
+
.. testcode::
|
| 39 |
+
|
| 40 |
+
def train_loop_per_worker():
|
| 41 |
+
...
|
| 42 |
+
|
| 43 |
+
.. testcode::
|
| 44 |
+
|
| 45 |
+
def train_loop_per_worker(config: Dict):
|
| 46 |
+
...
|
| 47 |
+
|
| 48 |
+
If ``train_loop_per_worker`` accepts an argument, then
|
| 49 |
+
``train_loop_config`` will be passed in as the argument. This is useful if you
|
| 50 |
+
want to tune the values in ``train_loop_config`` as hyperparameters.
|
| 51 |
+
|
| 52 |
+
If the ``datasets`` dict contains a training dataset (denoted by
|
| 53 |
+
the "train" key), then it will be split into multiple dataset
|
| 54 |
+
shards that can then be accessed by ``train.get_dataset_shard("train")`` inside
|
| 55 |
+
``train_loop_per_worker``. All the other datasets will not be split and
|
| 56 |
+
``train.get_dataset_shard(...)`` will return the the entire Dataset.
|
| 57 |
+
|
| 58 |
+
Inside the ``train_loop_per_worker`` function, you can use any of the
|
| 59 |
+
:ref:`Ray Train loop methods <train-loop-api>`.
|
| 60 |
+
|
| 61 |
+
.. testcode::
|
| 62 |
+
|
| 63 |
+
from ray import train
|
| 64 |
+
|
| 65 |
+
def train_loop_per_worker():
|
| 66 |
+
# Report intermediate results for callbacks or logging and
|
| 67 |
+
# checkpoint data.
|
| 68 |
+
train.report(...)
|
| 69 |
+
|
| 70 |
+
# Returns dict of last saved checkpoint.
|
| 71 |
+
train.get_checkpoint()
|
| 72 |
+
|
| 73 |
+
# Returns the Dataset shard for the given key.
|
| 74 |
+
train.get_dataset_shard("my_dataset")
|
| 75 |
+
|
| 76 |
+
# Returns the total number of workers executing training.
|
| 77 |
+
train.get_context().get_world_size()
|
| 78 |
+
|
| 79 |
+
# Returns the rank of this worker.
|
| 80 |
+
train.get_context().get_world_rank()
|
| 81 |
+
|
| 82 |
+
# Returns the rank of the worker on the current node.
|
| 83 |
+
train.get_context().get_local_rank()
|
| 84 |
+
|
| 85 |
+
Any returns from the ``train_loop_per_worker`` will be discarded and not
|
| 86 |
+
used or persisted anywhere.
|
| 87 |
+
|
| 88 |
+
**How do I use DataParallelTrainer or any of its subclasses?**
|
| 89 |
+
|
| 90 |
+
Example:
|
| 91 |
+
|
| 92 |
+
.. testcode::
|
| 93 |
+
|
| 94 |
+
import ray
|
| 95 |
+
from ray import train
|
| 96 |
+
from ray.train import ScalingConfig
|
| 97 |
+
from ray.train.data_parallel_trainer import DataParallelTrainer
|
| 98 |
+
|
| 99 |
+
def train_loop_for_worker():
|
| 100 |
+
dataset_shard_for_this_worker = train.get_dataset_shard("train")
|
| 101 |
+
|
| 102 |
+
# 3 items for 3 workers, each worker gets 1 item
|
| 103 |
+
batches = list(dataset_shard_for_this_worker.iter_batches(batch_size=1))
|
| 104 |
+
assert len(batches) == 1
|
| 105 |
+
|
| 106 |
+
train_dataset = ray.data.from_items([1, 2, 3])
|
| 107 |
+
assert train_dataset.count() == 3
|
| 108 |
+
trainer = DataParallelTrainer(
|
| 109 |
+
train_loop_for_worker,
|
| 110 |
+
scaling_config=ScalingConfig(num_workers=3),
|
| 111 |
+
datasets={"train": train_dataset},
|
| 112 |
+
)
|
| 113 |
+
result = trainer.fit()
|
| 114 |
+
|
| 115 |
+
.. testoutput::
|
| 116 |
+
:hide:
|
| 117 |
+
|
| 118 |
+
...
|
| 119 |
+
|
| 120 |
+
**How do I develop on top of DataParallelTrainer?**
|
| 121 |
+
|
| 122 |
+
In many cases, using DataParallelTrainer directly is sufficient to execute
|
| 123 |
+
functions on multiple actors.
|
| 124 |
+
|
| 125 |
+
However, you may want to subclass ``DataParallelTrainer`` and create a custom
|
| 126 |
+
Trainer for the following 2 use cases:
|
| 127 |
+
|
| 128 |
+
- **Use Case 1:** You want to do data parallel training, but want to have
|
| 129 |
+
a predefined ``training_loop_per_worker``.
|
| 130 |
+
|
| 131 |
+
- **Use Case 2:** You want to implement a custom
|
| 132 |
+
:py:class:`~ray.train.backend.Backend` that automatically handles
|
| 133 |
+
additional setup or teardown logic on each actor, so that the users of this
|
| 134 |
+
new trainer do not have to implement this logic. For example, a
|
| 135 |
+
``TensorflowTrainer`` can be built on top of ``DataParallelTrainer``
|
| 136 |
+
that automatically handles setting the proper environment variables for
|
| 137 |
+
distributed Tensorflow on each actor.
|
| 138 |
+
|
| 139 |
+
For 1, you can set a predefined training loop in __init__
|
| 140 |
+
|
| 141 |
+
.. testcode::
|
| 142 |
+
|
| 143 |
+
from ray.train.data_parallel_trainer import DataParallelTrainer
|
| 144 |
+
|
| 145 |
+
class MyDataParallelTrainer(DataParallelTrainer):
|
| 146 |
+
def __init__(self, *args, **kwargs):
|
| 147 |
+
predefined_train_loop_per_worker = lambda: 1
|
| 148 |
+
super().__init__(predefined_train_loop_per_worker, *args, **kwargs)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
For 2, you can implement the ``ray.train.Backend`` and ``ray.train.BackendConfig``
|
| 152 |
+
interfaces.
|
| 153 |
+
|
| 154 |
+
.. testcode::
|
| 155 |
+
|
| 156 |
+
from dataclasses import dataclass
|
| 157 |
+
from ray.train.backend import Backend, BackendConfig
|
| 158 |
+
|
| 159 |
+
class MyBackend(Backend):
|
| 160 |
+
def on_start(self, worker_group, backend_config):
|
| 161 |
+
def set_env_var(env_var_value):
|
| 162 |
+
import os
|
| 163 |
+
os.environ["MY_ENV_VAR"] = env_var_value
|
| 164 |
+
|
| 165 |
+
worker_group.execute(set_env_var, backend_config.env_var)
|
| 166 |
+
|
| 167 |
+
@dataclass
|
| 168 |
+
class MyBackendConfig(BackendConfig):
|
| 169 |
+
env_var: str = "default_value"
|
| 170 |
+
|
| 171 |
+
def backend_cls(self):
|
| 172 |
+
return MyBackend
|
| 173 |
+
|
| 174 |
+
class MyTrainer(DataParallelTrainer):
|
| 175 |
+
def __init__(self, train_loop_per_worker, my_backend_config:
|
| 176 |
+
MyBackendConfig, **kwargs):
|
| 177 |
+
|
| 178 |
+
super().__init__(
|
| 179 |
+
train_loop_per_worker,
|
| 180 |
+
backend_config=my_backend_config, **kwargs)
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
train_loop_per_worker: The training function to execute.
|
| 184 |
+
This can either take in no arguments or a ``config`` dict.
|
| 185 |
+
train_loop_config: Configurations to pass into
|
| 186 |
+
``train_loop_per_worker`` if it accepts an argument.
|
| 187 |
+
backend_config: Configuration for setting up a Backend (e.g. Torch,
|
| 188 |
+
Tensorflow, Horovod) on each worker to enable distributed
|
| 189 |
+
communication. If no Backend should be set up, then set this to None.
|
| 190 |
+
scaling_config: Configuration for how to scale data parallel training.
|
| 191 |
+
dataset_config: Configuration for dataset ingest. This is merged with the
|
| 192 |
+
default dataset config for the given trainer (`cls._dataset_config`).
|
| 193 |
+
run_config: Configuration for the execution of the training run.
|
| 194 |
+
datasets: Ray Datasets to use for training and evaluation.
|
| 195 |
+
This is a dict where the key is the name of the dataset, which
|
| 196 |
+
can be accessed from within the ``train_loop_per_worker`` by calling
|
| 197 |
+
``train.get_dataset_shard(dataset_key)``.
|
| 198 |
+
By default, all datasets are sharded equally across workers.
|
| 199 |
+
This can be configured via ``dataset_config``.
|
| 200 |
+
metadata: Dict that should be made available via
|
| 201 |
+
`train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
|
| 202 |
+
for checkpoints saved from this Trainer. Must be JSON-serializable.
|
| 203 |
+
resume_from_checkpoint: A checkpoint to resume training from.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
# Exposed here for testing purposes. Should never need
|
| 207 |
+
# to be overriden.
|
| 208 |
+
_backend_executor_cls: Type[BackendExecutor] = BackendExecutor
|
| 209 |
+
_training_iterator_cls: Type[TrainingIterator] = TrainingIterator
|
| 210 |
+
|
| 211 |
+
_scaling_config_allowed_keys = BaseTrainer._scaling_config_allowed_keys + [
|
| 212 |
+
"num_workers",
|
| 213 |
+
"resources_per_worker",
|
| 214 |
+
"use_gpu",
|
| 215 |
+
"placement_strategy",
|
| 216 |
+
"accelerator_type",
|
| 217 |
+
]
|
| 218 |
+
|
| 219 |
+
# For backwards compatibility with the legacy dataset config API.
|
| 220 |
+
_dataset_config = None
|
| 221 |
+
|
| 222 |
+
_fields_for_tuner_param_space = BaseTrainer._fields_for_tuner_param_space + [
|
| 223 |
+
"train_loop_config"
|
| 224 |
+
]
|
| 225 |
+
|
| 226 |
+
def __init__(
|
| 227 |
+
self,
|
| 228 |
+
train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
|
| 229 |
+
*,
|
| 230 |
+
train_loop_config: Optional[Dict] = None,
|
| 231 |
+
backend_config: Optional[BackendConfig] = None,
|
| 232 |
+
scaling_config: Optional[ScalingConfig] = None,
|
| 233 |
+
dataset_config: Optional[DataConfig] = None,
|
| 234 |
+
run_config: Optional[RunConfig] = None,
|
| 235 |
+
datasets: Optional[Dict[str, GenDataset]] = None,
|
| 236 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 237 |
+
resume_from_checkpoint: Optional[Checkpoint] = None,
|
| 238 |
+
):
|
| 239 |
+
self._train_loop_per_worker = train_loop_per_worker
|
| 240 |
+
self._train_loop_config = train_loop_config
|
| 241 |
+
|
| 242 |
+
if dataset_config is None:
|
| 243 |
+
dataset_config = DataConfig()
|
| 244 |
+
|
| 245 |
+
if not isinstance(dataset_config, DataConfig):
|
| 246 |
+
raise ValueError(
|
| 247 |
+
"`dataset_config` must be an instance of ray.train.DataConfig, "
|
| 248 |
+
f"was: {dataset_config}"
|
| 249 |
+
)
|
| 250 |
+
self._data_config = dataset_config
|
| 251 |
+
|
| 252 |
+
backend_config = (
|
| 253 |
+
backend_config if backend_config is not None else BackendConfig()
|
| 254 |
+
)
|
| 255 |
+
self._backend_config = backend_config
|
| 256 |
+
|
| 257 |
+
super(DataParallelTrainer, self).__init__(
|
| 258 |
+
scaling_config=scaling_config,
|
| 259 |
+
run_config=run_config,
|
| 260 |
+
datasets=datasets,
|
| 261 |
+
metadata=metadata,
|
| 262 |
+
resume_from_checkpoint=resume_from_checkpoint,
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
train_total_resources = self.scaling_config.total_resources
|
| 266 |
+
self._data_config.set_train_total_resources(
|
| 267 |
+
train_total_resources.get("CPU", 0),
|
| 268 |
+
train_total_resources.get("GPU", 0),
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
if env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0):
|
| 272 |
+
from ray.train._internal.state.state_actor import get_or_create_state_actor
|
| 273 |
+
|
| 274 |
+
get_or_create_state_actor()
|
| 275 |
+
|
| 276 |
+
@PublicAPI(stability="beta")
|
| 277 |
+
@classmethod
|
| 278 |
+
def restore(
|
| 279 |
+
cls: Type["DataParallelTrainer"],
|
| 280 |
+
path: str,
|
| 281 |
+
train_loop_per_worker: Optional[
|
| 282 |
+
Union[Callable[[], None], Callable[[Dict], None]]
|
| 283 |
+
] = None,
|
| 284 |
+
train_loop_config: Optional[Dict] = None,
|
| 285 |
+
**kwargs,
|
| 286 |
+
) -> "DataParallelTrainer":
|
| 287 |
+
"""Restores a DataParallelTrainer from a previously interrupted/failed run.
|
| 288 |
+
|
| 289 |
+
Args:
|
| 290 |
+
train_loop_per_worker: Optionally re-specified train loop function.
|
| 291 |
+
This should be used to re-specify a function that is not
|
| 292 |
+
restorable in a new Ray cluster (e.g., it holds onto outdated
|
| 293 |
+
object references). This should be the same training loop
|
| 294 |
+
that was passed to the original trainer constructor.
|
| 295 |
+
train_loop_config: Optionally re-specified train config.
|
| 296 |
+
This should similarly be used if the original `train_loop_config`
|
| 297 |
+
contained outdated object references, and it should not be modified
|
| 298 |
+
from what was originally passed in.
|
| 299 |
+
|
| 300 |
+
See :meth:`BaseTrainer.restore() <ray.train.trainer.BaseTrainer.restore>`
|
| 301 |
+
for descriptions of the other arguments.
|
| 302 |
+
|
| 303 |
+
Returns:
|
| 304 |
+
DataParallelTrainer: A restored instance of the `DataParallelTrainer`
|
| 305 |
+
subclass that is calling this method.
|
| 306 |
+
"""
|
| 307 |
+
return super(DataParallelTrainer, cls).restore(
|
| 308 |
+
path=path,
|
| 309 |
+
train_loop_per_worker=train_loop_per_worker,
|
| 310 |
+
train_loop_config=train_loop_config,
|
| 311 |
+
**kwargs,
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
def _validate_attributes(self):
|
| 315 |
+
super()._validate_attributes()
|
| 316 |
+
|
| 317 |
+
self._validate_train_loop_per_worker(
|
| 318 |
+
self._train_loop_per_worker, "train_loop_per_worker"
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
def _validate_train_loop_per_worker(
|
| 322 |
+
self, train_loop_per_worker: Callable, fn_name: str
|
| 323 |
+
) -> None:
|
| 324 |
+
num_required_params = count_required_parameters(train_loop_per_worker)
|
| 325 |
+
if num_required_params > 1:
|
| 326 |
+
raise ValueError(
|
| 327 |
+
f"{fn_name} should take in 0 or 1 arguments, "
|
| 328 |
+
f"but it accepts {num_required_params} arguments instead."
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
@classmethod
|
| 332 |
+
def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig:
|
| 333 |
+
scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
|
| 334 |
+
scaling_config
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
# This validation happens after the scaling config is updated from
|
| 338 |
+
# its specification in the Tuner `param_space`
|
| 339 |
+
if not scaling_config.use_gpu and "GPU" in ray.available_resources():
|
| 340 |
+
logger.info(
|
| 341 |
+
"GPUs are detected in your Ray cluster, but GPU "
|
| 342 |
+
"training is not enabled for this trainer. To enable "
|
| 343 |
+
"GPU training, make sure to set `use_gpu` to True "
|
| 344 |
+
"in your scaling config."
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
if scaling_config.num_workers is None:
|
| 348 |
+
raise ValueError(
|
| 349 |
+
"You must specify the 'num_workers' in `scaling_config` as either an "
|
| 350 |
+
f"argument of `{cls.__name__}` or through the `param_space` of a "
|
| 351 |
+
"`Tuner` (if performing hyperparameter tuning)."
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
if scaling_config.num_workers <= 0:
|
| 355 |
+
raise ValueError(
|
| 356 |
+
"'num_workers' in `scaling_config` must be a positive "
|
| 357 |
+
f"integer. Received {scaling_config.num_workers}"
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
return scaling_config
|
| 361 |
+
|
| 362 |
+
def _run_training(self, training_iterator: TrainingIterator) -> None:
|
| 363 |
+
"""This method loops over the `TrainingIterator`:
|
| 364 |
+
The actual iteration (for ... in ...) waits for the training function
|
| 365 |
+
on each worker to report a result and supplies it as a list of results.
|
| 366 |
+
Afterwards (in the body of the loop), it will report the result
|
| 367 |
+
to the Tune session.
|
| 368 |
+
The iterator ends after the training function on each worker has finished.
|
| 369 |
+
"""
|
| 370 |
+
for training_results in training_iterator:
|
| 371 |
+
# TODO(ml-team): add ability to report results from multiple workers.
|
| 372 |
+
self._propagate_results(training_results)
|
| 373 |
+
|
| 374 |
+
def _propagate_results(self, training_results: List[_TrainingResult]):
|
| 375 |
+
first_worker_result = training_results[0]
|
| 376 |
+
assert all(isinstance(result, _TrainingResult) for result in training_results)
|
| 377 |
+
|
| 378 |
+
tune_session = get_session()
|
| 379 |
+
|
| 380 |
+
# Check if any workers reported a checkpoint.
|
| 381 |
+
# If so, report a checkpoint pointing to the persisted location
|
| 382 |
+
# to Tune for book-keeping.
|
| 383 |
+
# NOTE: This removes the restriction for any individual worker
|
| 384 |
+
# (ex: global rank 0 worker) from needing to report a checkpoint.
|
| 385 |
+
# All workers reported a checkpoint to the same fs path, so there's
|
| 386 |
+
# no need to report multiple checkpoints to Tune.
|
| 387 |
+
worker_checkpoints = [
|
| 388 |
+
result.checkpoint
|
| 389 |
+
for result in training_results
|
| 390 |
+
if result.checkpoint is not None
|
| 391 |
+
]
|
| 392 |
+
at_least_one_reported_checkpoint = len(worker_checkpoints) > 0
|
| 393 |
+
|
| 394 |
+
if at_least_one_reported_checkpoint:
|
| 395 |
+
# Update the coordinator's checkpoint index to the latest.
|
| 396 |
+
# This is what keeps the checkpoint index in line with the workers.
|
| 397 |
+
tune_session.storage._update_checkpoint_index(first_worker_result.metrics)
|
| 398 |
+
|
| 399 |
+
# Make sure that all workers uploaded to the same location.
|
| 400 |
+
assert all(
|
| 401 |
+
checkpoint.path == tune_session.storage.checkpoint_fs_path
|
| 402 |
+
for checkpoint in worker_checkpoints
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
checkpoint = (
|
| 406 |
+
Checkpoint(
|
| 407 |
+
filesystem=tune_session.storage.storage_filesystem,
|
| 408 |
+
path=tune_session.storage.checkpoint_fs_path,
|
| 409 |
+
)
|
| 410 |
+
if at_least_one_reported_checkpoint
|
| 411 |
+
else None
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
tracked_training_result = _TrainingResult(
|
| 415 |
+
checkpoint=checkpoint,
|
| 416 |
+
metrics=first_worker_result.metrics,
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
logger.debug(
|
| 420 |
+
"Report (metrics, checkpoint) to the Tune session:\n"
|
| 421 |
+
f" metrics={tracked_training_result.metrics}\n"
|
| 422 |
+
f" checkpoint={tracked_training_result.checkpoint}"
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
# Report the metrics and checkpoint to Tune.
|
| 426 |
+
tune_session._report_training_result(tracked_training_result)
|
| 427 |
+
|
| 428 |
+
def training_loop(self) -> None:
|
| 429 |
+
scaling_config = self._validate_scaling_config(self.scaling_config)
|
| 430 |
+
|
| 431 |
+
train_loop_per_worker = construct_train_func(
|
| 432 |
+
self._train_loop_per_worker,
|
| 433 |
+
self._train_loop_config,
|
| 434 |
+
train_func_context=self._backend_config.train_func_context,
|
| 435 |
+
fn_arg_name="train_loop_per_worker",
|
| 436 |
+
discard_returns=True,
|
| 437 |
+
)
|
| 438 |
+
|
| 439 |
+
trial_info = TrialInfo(
|
| 440 |
+
name=session.get_trial_name(),
|
| 441 |
+
id=session.get_trial_id(),
|
| 442 |
+
resources=session.get_trial_resources(),
|
| 443 |
+
logdir=session.get_trial_dir(),
|
| 444 |
+
driver_ip=ray.util.get_node_ip_address(),
|
| 445 |
+
driver_node_id=ray.get_runtime_context().get_node_id(),
|
| 446 |
+
experiment_name=session.get_experiment_name(),
|
| 447 |
+
run_id=uuid.uuid4().hex,
|
| 448 |
+
)
|
| 449 |
+
|
| 450 |
+
backend_executor = self._backend_executor_cls(
|
| 451 |
+
backend_config=self._backend_config,
|
| 452 |
+
trial_info=trial_info,
|
| 453 |
+
num_workers=scaling_config.num_workers,
|
| 454 |
+
resources_per_worker=scaling_config._resources_per_worker_not_none,
|
| 455 |
+
max_retries=0,
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
+
# Start the remote actors.
|
| 459 |
+
backend_executor.start()
|
| 460 |
+
|
| 461 |
+
training_iterator = self._training_iterator_cls(
|
| 462 |
+
backend_executor=backend_executor,
|
| 463 |
+
backend_config=self._backend_config,
|
| 464 |
+
train_func=train_loop_per_worker,
|
| 465 |
+
datasets=self.datasets,
|
| 466 |
+
metadata=self.metadata,
|
| 467 |
+
data_config=self._data_config,
|
| 468 |
+
checkpoint=self.starting_checkpoint,
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
self._run_training(training_iterator)
|
| 472 |
+
|
| 473 |
+
# Shutdown workers.
|
| 474 |
+
backend_executor.shutdown()
|
| 475 |
+
|
| 476 |
+
def get_dataset_config(self) -> DataConfig:
|
| 477 |
+
"""Returns a copy of this Trainer's final dataset configs.
|
| 478 |
+
|
| 479 |
+
Returns:
|
| 480 |
+
The merged default + user-supplied dataset config.
|
| 481 |
+
"""
|
| 482 |
+
|
| 483 |
+
return self._data_config
|
| 484 |
+
|
| 485 |
+
@repr_with_fallback(["ipywidgets", "8"])
|
| 486 |
+
def _repr_mimebundle_(self, **kwargs):
|
| 487 |
+
"""Returns a mimebundle with an ipywidget repr and a simple text repr.
|
| 488 |
+
|
| 489 |
+
Depending on the frontend where the data is being displayed,
|
| 490 |
+
different mimetypes will be used from this bundle.
|
| 491 |
+
See https://ipython.readthedocs.io/en/stable/config/integrating.html
|
| 492 |
+
for information about this method, and
|
| 493 |
+
https://ipywidgets.readthedocs.io/en/latest/embedding.html
|
| 494 |
+
for more information about the jupyter widget mimetype.
|
| 495 |
+
|
| 496 |
+
Returns:
|
| 497 |
+
A mimebundle containing an ipywidget repr and a simple text repr.
|
| 498 |
+
"""
|
| 499 |
+
from ipywidgets import HTML, Layout, Tab, VBox
|
| 500 |
+
|
| 501 |
+
title = HTML(f"<h2>{self.__class__.__name__}</h2>")
|
| 502 |
+
|
| 503 |
+
children = []
|
| 504 |
+
titles = []
|
| 505 |
+
|
| 506 |
+
if self.datasets:
|
| 507 |
+
children.append(self._datasets_repr_())
|
| 508 |
+
titles.append("Datasets")
|
| 509 |
+
|
| 510 |
+
children.append(HTML(self._data_config_repr_html_()))
|
| 511 |
+
titles.append("Data Config")
|
| 512 |
+
|
| 513 |
+
if self._train_loop_config:
|
| 514 |
+
children.append(HTML(self._train_loop_config_repr_html_()))
|
| 515 |
+
titles.append("Train Loop Config")
|
| 516 |
+
|
| 517 |
+
if self.scaling_config:
|
| 518 |
+
children.append(HTML(self.scaling_config._repr_html_()))
|
| 519 |
+
titles.append("Scaling Config")
|
| 520 |
+
|
| 521 |
+
if self.run_config:
|
| 522 |
+
children.append(HTML(self.run_config._repr_html_()))
|
| 523 |
+
titles.append("Run Config")
|
| 524 |
+
|
| 525 |
+
if self._backend_config:
|
| 526 |
+
children.append(HTML(self._backend_config._repr_html_()))
|
| 527 |
+
titles.append("Backend Config")
|
| 528 |
+
|
| 529 |
+
tab = Tab(children, titles=titles)
|
| 530 |
+
widget = VBox([title, tab], layout=Layout(width="100%"))
|
| 531 |
+
bundle = widget._repr_mimebundle_(**kwargs)
|
| 532 |
+
bundle.update(
|
| 533 |
+
{
|
| 534 |
+
"text/plain": repr(self),
|
| 535 |
+
}
|
| 536 |
+
)
|
| 537 |
+
return bundle
|
| 538 |
+
|
| 539 |
+
def _train_loop_config_repr_html_(self) -> str:
|
| 540 |
+
if self._train_loop_config:
|
| 541 |
+
table_data = {}
|
| 542 |
+
for k, v in self._train_loop_config.items():
|
| 543 |
+
if isinstance(v, str) or str(v).isnumeric():
|
| 544 |
+
table_data[k] = v
|
| 545 |
+
elif hasattr(v, "_repr_html_"):
|
| 546 |
+
table_data[k] = v._repr_html_()
|
| 547 |
+
else:
|
| 548 |
+
table_data[k] = str(v)
|
| 549 |
+
|
| 550 |
+
return Template("title_data.html.j2").render(
|
| 551 |
+
title="Train Loop Config",
|
| 552 |
+
data=Template("scrollableTable.html.j2").render(
|
| 553 |
+
table=tabulate(
|
| 554 |
+
table_data.items(),
|
| 555 |
+
headers=["Setting", "Value"],
|
| 556 |
+
showindex=False,
|
| 557 |
+
tablefmt="unsafehtml",
|
| 558 |
+
),
|
| 559 |
+
max_height="none",
|
| 560 |
+
),
|
| 561 |
+
)
|
| 562 |
+
else:
|
| 563 |
+
return ""
|
| 564 |
+
|
| 565 |
+
def _data_config_repr_html_(self) -> str:
|
| 566 |
+
# TODO make this rendering nicer.
|
| 567 |
+
content = [str(self._data_config)]
|
| 568 |
+
return Template("rendered_html_common.html.j2").render(content=content)
|
| 569 |
+
|
| 570 |
+
def _datasets_repr_(self) -> str:
|
| 571 |
+
from ipywidgets import HTML, Layout, VBox
|
| 572 |
+
|
| 573 |
+
content = []
|
| 574 |
+
if self.datasets:
|
| 575 |
+
for name, config in self.datasets.items():
|
| 576 |
+
tab = config._tab_repr_()
|
| 577 |
+
if tab:
|
| 578 |
+
content.append(
|
| 579 |
+
HTML(
|
| 580 |
+
Template("title_data.html.j2").render(
|
| 581 |
+
title=f"Dataset - <code>{name}</code>", data=None
|
| 582 |
+
)
|
| 583 |
+
)
|
| 584 |
+
)
|
| 585 |
+
content.append(config._tab_repr_())
|
| 586 |
+
|
| 587 |
+
return VBox(content, layout=Layout(width="100%"))
|
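The value-rendering rule in `_train_loop_config_repr_html_` above can be shown with a small standalone sketch (illustrative only, not part of the uploaded file): string and numeric-looking values are kept as-is, objects that expose `_repr_html_()` use that representation, and everything else falls back to `str()`.

# Minimal sketch of the rendering rule used above (illustrative only).
def render_value(v):
    if isinstance(v, str) or str(v).isnumeric():
        return v
    elif hasattr(v, "_repr_html_"):
        return v._repr_html_()
    else:
        return str(v)

print(render_value(3))             # 3 (numeric-looking, kept as-is)
print(render_value("adam"))        # adam (string, kept as-is)
print(render_value([1e-3, 1e-4]))  # [0.001, 0.0001] (falls back to str)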
.venv/lib/python3.11/site-packages/ray/train/error.py
ADDED
|
@@ -0,0 +1,6 @@
|
| 1 |
+
from ray.util.annotations import PublicAPI
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@PublicAPI(stability="beta")
|
| 5 |
+
class SessionMisuseError(Exception):
|
| 6 |
+
"""Indicates a method or function was used outside of a session."""
|
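A tiny illustrative sketch (not part of the uploaded file): `SessionMisuseError` is a plain `Exception` subclass, so it can be raised and caught like any other error; the message text below is made up.

from ray.train.error import SessionMisuseError

try:
    # Hypothetical message; real session utilities construct their own.
    raise SessionMisuseError("train.report() was called outside of a session")
except SessionMisuseError as exc:
    print(f"caught {type(exc).__name__}: {exc}")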
.venv/lib/python3.11/site-packages/ray/train/examples/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/train/examples/mlflow_simple_example.py
ADDED
|
@@ -0,0 +1,55 @@
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from ray import train
|
| 4 |
+
from ray.train import RunConfig, ScalingConfig
|
| 5 |
+
from ray.train.torch import TorchTrainer
|
| 6 |
+
from ray.tune.logger import TBXLoggerCallback
|
| 7 |
+
from ray.tune.logger.mlflow import MLflowLoggerCallback
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def train_func():
|
| 11 |
+
for i in range(3):
|
| 12 |
+
train.report(dict(epoch=i))
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
trainer = TorchTrainer(
|
| 16 |
+
train_func,
|
| 17 |
+
scaling_config=ScalingConfig(num_workers=2),
|
| 18 |
+
run_config=RunConfig(
|
| 19 |
+
callbacks=[
|
| 20 |
+
MLflowLoggerCallback(experiment_name="train_experiment"),
|
| 21 |
+
TBXLoggerCallback(),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Run the training function, logging all the intermediate results
|
| 27 |
+
# to MLflow and Tensorboard.
|
| 28 |
+
result = trainer.fit()
|
| 29 |
+
|
| 30 |
+
# For MLflow logs:
|
| 31 |
+
|
| 32 |
+
# MLflow logs will by default be saved in an `mlflow` directory
|
| 33 |
+
# in the current working directory.
|
| 34 |
+
|
| 35 |
+
# $ cd mlflow
|
| 36 |
+
# # View the MLflow UI.
|
| 37 |
+
# $ mlflow ui
|
| 38 |
+
|
| 39 |
+
# You can change the directory by setting the `tracking_uri` argument
|
| 40 |
+
# in `MLflowLoggerCallback`.
|
| 41 |
+
|
| 42 |
+
# For TensorBoard logs:
|
| 43 |
+
|
| 44 |
+
# Print the latest run directory and keep note of it.
|
| 45 |
+
# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06
|
| 46 |
+
print("Run directory:", Path(result.path).parent) # TensorBoard is saved in parent dir
|
| 47 |
+
|
| 48 |
+
# How to visualize the logs
|
| 49 |
+
|
| 50 |
+
# Navigate to the run directory of the trainer.
|
| 51 |
+
# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06`
|
| 52 |
+
# $ cd <TRAINER_RUN_DIR>
|
| 53 |
+
#
|
| 54 |
+
# # View the tensorboard UI.
|
| 55 |
+
# $ tensorboard --logdir .
|
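Following up on the `tracking_uri` note in the comments above, a minimal sketch of pointing the MLflow callback at a custom location could look like this; the path is illustrative, and any URI supported by MLflow should work.

from ray.tune.logger.mlflow import MLflowLoggerCallback

# Hypothetical local path; replace with your own MLflow tracking location.
mlflow_callback = MLflowLoggerCallback(
    experiment_name="train_experiment",
    tracking_uri="file:///tmp/my_mlflow_logs",
)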
.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_autoencoder_example.py
ADDED
|
@@ -0,0 +1,77 @@
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from ray import tune
|
| 5 |
+
from ray.train import ScalingConfig
|
| 6 |
+
from ray.train.examples.tf.tensorflow_mnist_example import train_func
|
| 7 |
+
from ray.train.tensorflow import TensorflowTrainer
|
| 8 |
+
from ray.tune.tune_config import TuneConfig
|
| 9 |
+
from ray.tune.tuner import Tuner
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def tune_tensorflow_mnist(
|
| 13 |
+
num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False
|
| 14 |
+
):
|
| 15 |
+
scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
|
| 16 |
+
trainer = TensorflowTrainer(
|
| 17 |
+
train_loop_per_worker=train_func,
|
| 18 |
+
scaling_config=scaling_config,
|
| 19 |
+
)
|
| 20 |
+
tuner = Tuner(
|
| 21 |
+
trainer,
|
| 22 |
+
tune_config=TuneConfig(
|
| 23 |
+
num_samples=num_samples, metric="binary_crossentropy", mode="min"
|
| 24 |
+
),
|
| 25 |
+
param_space={
|
| 26 |
+
"train_loop_config": {
|
| 27 |
+
"lr": tune.loguniform(1e-4, 1e-1),
|
| 28 |
+
"batch_size": tune.choice([32, 64, 128]),
|
| 29 |
+
"epochs": 3,
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
)
|
| 33 |
+
best_accuracy = tuner.fit().get_best_result().metrics["binary_crossentropy"]
|
| 34 |
+
print(f"Best accuracy config: {best_accuracy}")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
parser = argparse.ArgumentParser()
|
| 39 |
+
parser.add_argument(
|
| 40 |
+
"--smoke-test",
|
| 41 |
+
action="store_true",
|
| 42 |
+
default=False,
|
| 43 |
+
help="Finish quickly for testing.",
|
| 44 |
+
)
|
| 45 |
+
parser.add_argument(
|
| 46 |
+
"--address", required=False, type=str, help="the address to use for Ray"
|
| 47 |
+
)
|
| 48 |
+
parser.add_argument(
|
| 49 |
+
"--num-workers",
|
| 50 |
+
"-n",
|
| 51 |
+
type=int,
|
| 52 |
+
default=2,
|
| 53 |
+
help="Sets number of workers for training.",
|
| 54 |
+
)
|
| 55 |
+
parser.add_argument(
|
| 56 |
+
"--num-samples",
|
| 57 |
+
type=int,
|
| 58 |
+
default=2,
|
| 59 |
+
help="Sets number of samples for training.",
|
| 60 |
+
)
|
| 61 |
+
parser.add_argument(
|
| 62 |
+
"--use-gpu", action="store_true", default=False, help="Enables GPU training"
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
args = parser.parse_args()
|
| 66 |
+
|
| 67 |
+
if args.smoke_test:
|
| 68 |
+
num_gpus = args.num_workers if args.use_gpu else 0
|
| 69 |
+
ray.init(num_cpus=8, num_gpus=num_gpus)
|
| 70 |
+
tune_tensorflow_mnist(num_workers=2, num_samples=2, use_gpu=args.use_gpu)
|
| 71 |
+
else:
|
| 72 |
+
ray.init(address=args.address)
|
| 73 |
+
tune_tensorflow_mnist(
|
| 74 |
+
num_workers=args.num_workers,
|
| 75 |
+
num_samples=args.num_samples,
|
| 76 |
+
use_gpu=args.use_gpu,
|
| 77 |
+
)
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (194 Bytes).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
| 1 |
+
from ray.train.huggingface.transformers._transformers_utils import (
|
| 2 |
+
RayTrainReportCallback,
|
| 3 |
+
prepare_trainer,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"RayTrainReportCallback",
|
| 8 |
+
"prepare_trainer",
|
| 9 |
+
]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# DO NOT ADD ANYTHING AFTER THIS LINE.
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (410 Bytes).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/_transformers_utils.cpython-311.pyc
ADDED
|
Binary file (8.74 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/_transformers_utils.py
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
import logging
|
| 2 |
+
import shutil
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from tempfile import TemporaryDirectory
|
| 5 |
+
from typing import Iterator, Optional, Type
|
| 6 |
+
|
| 7 |
+
from torch.utils.data import DataLoader, Dataset, IterableDataset
|
| 8 |
+
|
| 9 |
+
import ray
|
| 10 |
+
from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
|
| 11 |
+
from ray.data.iterator import _IterableFromIterator
|
| 12 |
+
from ray.train import Checkpoint
|
| 13 |
+
from ray.util import PublicAPI
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
TRANSFORMERS_IMPORT_ERROR: Optional[ImportError] = None
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
import transformers.trainer
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
from transformers.trainer_callback import TrainerCallback
|
| 24 |
+
except ImportError as e:
|
| 25 |
+
TRANSFORMERS_IMPORT_ERROR = e
|
| 26 |
+
TrainerCallback = object
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@PublicAPI(stability="beta")
|
| 30 |
+
class RayTrainReportCallback(TrainerCallback):
|
| 31 |
+
"""A simple callback to report checkpoints and metrics to Ray Train.
|
| 32 |
+
|
| 33 |
+
This callback is a subclass of `transformers.TrainerCallback
|
| 34 |
+
<https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback>`_
|
| 35 |
+
and overrides the `TrainerCallback.on_save()` method. After
|
| 36 |
+
a new checkpoint gets saved, it fetches the latest metric dictionary
|
| 37 |
+
from `TrainerState.log_history` and reports it with the latest checkpoint
|
| 38 |
+
to Ray Train.
|
| 39 |
+
|
| 40 |
+
Checkpoints will be saved in the following structure::
|
| 41 |
+
|
| 42 |
+
checkpoint_00000*/ Ray Train Checkpoint
|
| 43 |
+
└─ checkpoint/ Hugging Face Transformers Checkpoint
|
| 44 |
+
|
| 45 |
+
For customized reporting and checkpointing logic, implement your own
|
| 46 |
+
`transformers.TrainerCallback` following this user
|
| 47 |
+
guide: :ref:`Saving and Loading Checkpoints <train-dl-saving-checkpoints>`.
|
| 48 |
+
|
| 49 |
+
Note that users should ensure that the logging, evaluation, and saving frequencies
|
| 50 |
+
are properly configured so that the monitoring metric is always up-to-date
|
| 51 |
+
when `transformers.Trainer` saves a checkpoint.
|
| 52 |
+
|
| 53 |
+
Suppose the monitoring metric is reported from evaluation stage:
|
| 54 |
+
|
| 55 |
+
Some valid configurations:
|
| 56 |
+
- evaluation_strategy == save_strategy == "epoch"
|
| 57 |
+
- evaluation_strategy == save_strategy == "steps", save_steps % eval_steps == 0
|
| 58 |
+
|
| 59 |
+
Some invalid configurations:
|
| 60 |
+
- evaluation_strategy != save_strategy
|
| 61 |
+
- evaluation_strategy == save_strategy == "steps", save_steps % eval_steps != 0
|
| 62 |
+
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
CHECKPOINT_NAME = "checkpoint"
|
| 66 |
+
|
| 67 |
+
def __init__(self, *args, **kwargs):
|
| 68 |
+
super().__init__(*args, **kwargs)
|
| 69 |
+
record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_RAYTRAINREPORTCALLBACK, "1")
|
| 70 |
+
|
| 71 |
+
def on_save(self, args, state, control, **kwargs):
|
| 72 |
+
"""Event called after a checkpoint save."""
|
| 73 |
+
with TemporaryDirectory() as tmpdir:
|
| 74 |
+
# Aggregate all the logged metrics
|
| 75 |
+
metrics = {}
|
| 76 |
+
for log in state.log_history:
|
| 77 |
+
metrics.update(log)
|
| 78 |
+
|
| 79 |
+
# Copy ckpt files and construct a Ray Train Checkpoint
|
| 80 |
+
source_ckpt_path = transformers.trainer.get_last_checkpoint(args.output_dir)
|
| 81 |
+
if source_ckpt_path is not None:
|
| 82 |
+
target_ckpt_path = Path(tmpdir, self.CHECKPOINT_NAME).as_posix()
|
| 83 |
+
shutil.copytree(source_ckpt_path, target_ckpt_path)
|
| 84 |
+
checkpoint = Checkpoint.from_directory(tmpdir)
|
| 85 |
+
else:
|
| 86 |
+
checkpoint = None
|
| 87 |
+
|
| 88 |
+
# Report latest metrics and checkpoint to Ray Train
|
| 89 |
+
ray.train.report(metrics=metrics, checkpoint=checkpoint)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class RayTorchIterableDataset(IterableDataset):
|
| 93 |
+
"""Wrapper class for ray data iterables."""
|
| 94 |
+
|
| 95 |
+
def __init__(self, data_iterable) -> None:
|
| 96 |
+
super().__init__()
|
| 97 |
+
self.data_iterable = data_iterable
|
| 98 |
+
|
| 99 |
+
def __iter__(self) -> Iterator:
|
| 100 |
+
return iter(self.data_iterable)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@PublicAPI(stability="beta")
|
| 104 |
+
def prepare_trainer(trainer: "Trainer") -> "Trainer":
|
| 105 |
+
"""Prepare your HuggingFace Transformer Trainer for Ray Train.
|
| 106 |
+
|
| 107 |
+
This utility function enables the trainer to integrate with Ray Data.
|
| 108 |
+
Internally, it overrides the `get_train_dataloader` and `get_eval_dataloader`
|
| 109 |
+
methods and injects the data integration logic if the `train_dataset` and
|
| 110 |
+
`eval_dataset` are Ray Data Iterables.
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
if TRANSFORMERS_IMPORT_ERROR is not None:
|
| 114 |
+
raise TRANSFORMERS_IMPORT_ERROR
|
| 115 |
+
|
| 116 |
+
base_trainer_class: Type[transformers.trainer.Trainer] = trainer.__class__
|
| 117 |
+
|
| 118 |
+
class RayTransformersTrainer(base_trainer_class):
|
| 119 |
+
"""A Wrapper of `transformers.Trainer` for Ray Data Integration."""
|
| 120 |
+
|
| 121 |
+
def get_train_dataloader(self) -> DataLoader:
|
| 122 |
+
if isinstance(self.train_dataset, _IterableFromIterator):
|
| 123 |
+
dataset = RayTorchIterableDataset(self.train_dataset)
|
| 124 |
+
return DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0])
|
| 125 |
+
else:
|
| 126 |
+
return super().get_train_dataloader()
|
| 127 |
+
|
| 128 |
+
def get_eval_dataloader(
|
| 129 |
+
self, eval_dataset: Optional[Dataset] = None
|
| 130 |
+
) -> DataLoader:
|
| 131 |
+
if eval_dataset is None:
|
| 132 |
+
eval_dataset = self.eval_dataset
|
| 133 |
+
|
| 134 |
+
if isinstance(eval_dataset, _IterableFromIterator):
|
| 135 |
+
dataset = RayTorchIterableDataset(eval_dataset)
|
| 136 |
+
return DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0])
|
| 137 |
+
else:
|
| 138 |
+
return super().get_eval_dataloader(eval_dataset)
|
| 139 |
+
|
| 140 |
+
trainer.__class__ = RayTransformersTrainer
|
| 141 |
+
|
| 142 |
+
record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_PREPARE_TRAINER, "1")
|
| 143 |
+
return trainer
|
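A rough usage sketch for the two utilities above, pieced together from the docstrings in this file: add the callback to an existing `transformers.Trainer`, wrap it with `prepare_trainer`, and run the whole thing inside a Ray `TorchTrainer`. The `build_hf_trainer()` helper is hypothetical and stands in for however you normally construct a `transformers.Trainer` (model, `TrainingArguments`, datasets); this is a sketch, not the canonical recipe.

import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.train.huggingface.transformers import (
    RayTrainReportCallback,
    prepare_trainer,
)


def train_func(config):
    # Placeholder: build a transformers.Trainer the usual way.
    hf_trainer = build_hf_trainer()  # hypothetical helper, not a real API

    # Report metrics and checkpoints to Ray Train after each HF checkpoint save.
    hf_trainer.add_callback(RayTrainReportCallback())

    # Patch the dataloaders so Ray Data iterables are handled transparently.
    hf_trainer = prepare_trainer(hf_trainer)
    hf_trainer.train()


trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
result = trainer.fit()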
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
| 1 |
+
from ray.train.lightgbm._lightgbm_utils import RayTrainReportCallback
|
| 2 |
+
from ray.train.lightgbm.lightgbm_checkpoint import LightGBMCheckpoint
|
| 3 |
+
from ray.train.lightgbm.lightgbm_predictor import LightGBMPredictor
|
| 4 |
+
from ray.train.lightgbm.lightgbm_trainer import LightGBMTrainer
|
| 5 |
+
from ray.train.v2._internal.constants import is_v2_enabled
|
| 6 |
+
|
| 7 |
+
if is_v2_enabled():
|
| 8 |
+
from ray.train.v2.lightgbm.lightgbm_trainer import LightGBMTrainer # noqa: F811
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"RayTrainReportCallback",
|
| 12 |
+
"LightGBMCheckpoint",
|
| 13 |
+
"LightGBMPredictor",
|
| 14 |
+
"LightGBMTrainer",
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# DO NOT ADD ANYTHING AFTER THIS LINE.
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (814 Bytes).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/_lightgbm_utils.cpython-311.pyc
ADDED
|
Binary file (8.97 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (5.3 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_checkpoint.cpython-311.pyc
ADDED
|
Binary file (4.03 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_predictor.cpython-311.pyc
ADDED
|
Binary file (7.27 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_trainer.cpython-311.pyc
ADDED
|
Binary file (11 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/v2.cpython-311.pyc
ADDED
|
Binary file (6.73 kB).
|
|
|
.venv/lib/python3.11/site-packages/ray/train/lightgbm/_lightgbm_utils.py
ADDED
|
@@ -0,0 +1,170 @@
|
| 1 |
+
import tempfile
|
| 2 |
+
from contextlib import contextmanager
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Callable, Dict, List, Optional, Union
|
| 5 |
+
|
| 6 |
+
from lightgbm.basic import Booster
|
| 7 |
+
from lightgbm.callback import CallbackEnv
|
| 8 |
+
|
| 9 |
+
import ray.train
|
| 10 |
+
from ray.train import Checkpoint
|
| 11 |
+
from ray.tune.utils import flatten_dict
|
| 12 |
+
from ray.util.annotations import PublicAPI
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@PublicAPI(stability="beta")
|
| 16 |
+
class RayTrainReportCallback:
|
| 17 |
+
"""Creates a callback that reports metrics and checkpoints model.
|
| 18 |
+
|
| 19 |
+
Args:
|
| 20 |
+
metrics: Metrics to report. If this is a list,
|
| 21 |
+
each item should be a metric key reported by LightGBM,
|
| 22 |
+
and it will be reported to Ray Train/Tune under the same name.
|
| 23 |
+
This can also be a dict of {<key-to-report>: <lightgbm-metric-key>},
|
| 24 |
+
which can be used to rename LightGBM default metrics.
|
| 25 |
+
filename: Customize the saved checkpoint file type by passing
|
| 26 |
+
a filename. Defaults to "model.txt".
|
| 27 |
+
frequency: How often to save checkpoints, in terms of iterations.
|
| 28 |
+
Defaults to 0 (no checkpoints are saved during training).
|
| 29 |
+
checkpoint_at_end: Whether or not to save a checkpoint at the end of training.
|
| 30 |
+
results_postprocessing_fn: An optional Callable that takes in
|
| 31 |
+
the metrics dict that will be reported (after it has been flattened)
|
| 32 |
+
and returns a modified dict.
|
| 33 |
+
|
| 34 |
+
Examples
|
| 35 |
+
--------
|
| 36 |
+
|
| 37 |
+
Reporting checkpoints and metrics to Ray Tune when running many
|
| 38 |
+
independent LightGBM trials (without data parallelism within a trial).
|
| 39 |
+
|
| 40 |
+
.. testcode::
|
| 41 |
+
:skipif: True
|
| 42 |
+
|
| 43 |
+
import lightgbm
|
| 44 |
+
|
| 45 |
+
from ray.train.lightgbm import RayTrainReportCallback
|
| 46 |
+
|
| 47 |
+
config = {
|
| 48 |
+
# ...
|
| 49 |
+
"metric": ["binary_logloss", "binary_error"],
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
# Report only log loss to Tune after each validation epoch.
|
| 53 |
+
bst = lightgbm.train(
|
| 54 |
+
...,
|
| 55 |
+
callbacks=[
|
| 56 |
+
RayTrainReportCallback(
|
| 57 |
+
metrics={"loss": "eval-binary_logloss"}, frequency=1
|
| 58 |
+
)
|
| 59 |
+
],
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
Loading a model from a checkpoint reported by this callback.
|
| 63 |
+
|
| 64 |
+
.. testcode::
|
| 65 |
+
:skipif: True
|
| 66 |
+
|
| 67 |
+
from ray.train.lightgbm import RayTrainReportCallback
|
| 68 |
+
|
| 69 |
+
# Get a `Checkpoint` object that is saved by the callback during training.
|
| 70 |
+
result = trainer.fit()
|
| 71 |
+
booster = RayTrainReportCallback.get_model(result.checkpoint)
|
| 72 |
+
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
CHECKPOINT_NAME = "model.txt"
|
| 76 |
+
|
| 77 |
+
def __init__(
|
| 78 |
+
self,
|
| 79 |
+
metrics: Optional[Union[str, List[str], Dict[str, str]]] = None,
|
| 80 |
+
filename: str = CHECKPOINT_NAME,
|
| 81 |
+
frequency: int = 0,
|
| 82 |
+
checkpoint_at_end: bool = True,
|
| 83 |
+
results_postprocessing_fn: Optional[
|
| 84 |
+
Callable[[Dict[str, Union[float, List[float]]]], Dict[str, float]]
|
| 85 |
+
] = None,
|
| 86 |
+
):
|
| 87 |
+
if isinstance(metrics, str):
|
| 88 |
+
metrics = [metrics]
|
| 89 |
+
self._metrics = metrics
|
| 90 |
+
self._filename = filename
|
| 91 |
+
self._frequency = frequency
|
| 92 |
+
self._checkpoint_at_end = checkpoint_at_end
|
| 93 |
+
self._results_postprocessing_fn = results_postprocessing_fn
|
| 94 |
+
|
| 95 |
+
@classmethod
|
| 96 |
+
def get_model(
|
| 97 |
+
cls, checkpoint: Checkpoint, filename: str = CHECKPOINT_NAME
|
| 98 |
+
) -> Booster:
|
| 99 |
+
"""Retrieve the model stored in a checkpoint reported by this callback.
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
checkpoint: The checkpoint object returned by a training run.
|
| 103 |
+
The checkpoint should be saved by an instance of this callback.
|
| 104 |
+
filename: The filename to load the model from, which should match
|
| 105 |
+
the filename used when creating the callback.
|
| 106 |
+
"""
|
| 107 |
+
with checkpoint.as_directory() as checkpoint_path:
|
| 108 |
+
return Booster(model_file=Path(checkpoint_path, filename).as_posix())
|
| 109 |
+
|
| 110 |
+
def _get_report_dict(self, evals_log: Dict[str, Dict[str, list]]) -> dict:
|
| 111 |
+
result_dict = flatten_dict(evals_log, delimiter="-")
|
| 112 |
+
if not self._metrics:
|
| 113 |
+
report_dict = result_dict
|
| 114 |
+
else:
|
| 115 |
+
report_dict = {}
|
| 116 |
+
for key in self._metrics:
|
| 117 |
+
if isinstance(self._metrics, dict):
|
| 118 |
+
metric = self._metrics[key]
|
| 119 |
+
else:
|
| 120 |
+
metric = key
|
| 121 |
+
report_dict[key] = result_dict[metric]
|
| 122 |
+
if self._results_postprocessing_fn:
|
| 123 |
+
report_dict = self._results_postprocessing_fn(report_dict)
|
| 124 |
+
return report_dict
|
| 125 |
+
|
| 126 |
+
def _get_eval_result(self, env: CallbackEnv) -> dict:
|
| 127 |
+
eval_result = {}
|
| 128 |
+
for entry in env.evaluation_result_list:
|
| 129 |
+
data_name, eval_name, result = entry[0:3]
|
| 130 |
+
if len(entry) > 4:
|
| 131 |
+
stdv = entry[4]
|
| 132 |
+
suffix = "-mean"
|
| 133 |
+
else:
|
| 134 |
+
stdv = None
|
| 135 |
+
suffix = ""
|
| 136 |
+
if data_name not in eval_result:
|
| 137 |
+
eval_result[data_name] = {}
|
| 138 |
+
eval_result[data_name][eval_name + suffix] = result
|
| 139 |
+
if stdv is not None:
|
| 140 |
+
eval_result[data_name][eval_name + "-stdv"] = stdv
|
| 141 |
+
return eval_result
|
| 142 |
+
|
| 143 |
+
@contextmanager
|
| 144 |
+
def _get_checkpoint(self, model: Booster) -> Optional[Checkpoint]:
|
| 145 |
+
if ray.train.get_context().get_world_rank() in (0, None):
|
| 146 |
+
with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
|
| 147 |
+
model.save_model(Path(temp_checkpoint_dir, self._filename).as_posix())
|
| 148 |
+
yield Checkpoint.from_directory(temp_checkpoint_dir)
|
| 149 |
+
else:
|
| 150 |
+
yield None
|
| 151 |
+
|
| 152 |
+
def __call__(self, env: CallbackEnv) -> None:
|
| 153 |
+
eval_result = self._get_eval_result(env)
|
| 154 |
+
report_dict = self._get_report_dict(eval_result)
|
| 155 |
+
|
| 156 |
+
# Ex: if frequency=2, checkpoint_at_end=True and num_boost_rounds=11,
|
| 157 |
+
# you will checkpoint at iterations 1, 3, 5, ..., 9, and 10 (checkpoint_at_end)
|
| 158 |
+
# (iterations count from 0)
|
| 159 |
+
on_last_iter = env.iteration == env.end_iteration - 1
|
| 160 |
+
should_checkpoint_at_end = on_last_iter and self._checkpoint_at_end
|
| 161 |
+
should_checkpoint_with_frequency = (
|
| 162 |
+
self._frequency != 0 and (env.iteration + 1) % self._frequency == 0
|
| 163 |
+
)
|
| 164 |
+
should_checkpoint = should_checkpoint_at_end or should_checkpoint_with_frequency
|
| 165 |
+
|
| 166 |
+
if should_checkpoint:
|
| 167 |
+
with self._get_checkpoint(model=env.model) as checkpoint:
|
| 168 |
+
ray.train.report(report_dict, checkpoint=checkpoint)
|
| 169 |
+
else:
|
| 170 |
+
ray.train.report(report_dict)
|
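The checkpointing schedule described in the comment inside `__call__` can be reproduced with a short standalone sketch; it re-implements the `should_checkpoint` logic above for `frequency=2`, `checkpoint_at_end=True`, and `num_boost_round=11` and confirms the iterations listed in that comment.

# Standalone sketch of the checkpointing schedule (illustrative only).
frequency, checkpoint_at_end, num_boost_round = 2, True, 11
end_iteration = num_boost_round

checkpointed = []
for iteration in range(end_iteration):
    on_last_iter = iteration == end_iteration - 1
    should_checkpoint = (on_last_iter and checkpoint_at_end) or (
        frequency != 0 and (iteration + 1) % frequency == 0
    )
    if should_checkpoint:
        checkpointed.append(iteration)

print(checkpointed)  # [1, 3, 5, 7, 9, 10]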
.venv/lib/python3.11/site-packages/ray/train/lightgbm/config.py
ADDED
|
@@ -0,0 +1,89 @@
|
| 1 |
+
import logging
|
| 2 |
+
import threading
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any, Dict, Optional
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.train._internal.utils import get_address_and_port
|
| 8 |
+
from ray.train._internal.worker_group import WorkerGroup
|
| 9 |
+
from ray.train.backend import Backend, BackendConfig
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Global LightGBM distributed network configuration for each worker process.
|
| 15 |
+
_lightgbm_network_params: Optional[Dict[str, Any]] = None
|
| 16 |
+
_lightgbm_network_params_lock = threading.Lock()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_network_params() -> Dict[str, Any]:
|
| 20 |
+
"""Returns the network parameters to enable LightGBM distributed training."""
|
| 21 |
+
global _lightgbm_network_params
|
| 22 |
+
|
| 23 |
+
with _lightgbm_network_params_lock:
|
| 24 |
+
if not _lightgbm_network_params:
|
| 25 |
+
logger.warning(
|
| 26 |
+
"`ray.train.lightgbm.get_network_params` was called outside "
|
| 27 |
+
"the context of a `ray.train.lightgbm.LightGBMTrainer`. "
|
| 28 |
+
"The current process has no knowledge of the distributed training "
|
| 29 |
+
"worker group, so this method will return an empty dict. "
|
| 30 |
+
"Please call this within the training loop of a "
|
| 31 |
+
"`ray.train.lightgbm.LightGBMTrainer`. "
|
| 32 |
+
"If you are in fact calling this within a `LightGBMTrainer`, "
|
| 33 |
+
"this is unexpected: please file a bug report to the Ray Team."
|
| 34 |
+
)
|
| 35 |
+
return {}
|
| 36 |
+
|
| 37 |
+
return _lightgbm_network_params.copy()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _set_network_params(
|
| 41 |
+
num_machines: int,
|
| 42 |
+
local_listen_port: int,
|
| 43 |
+
machines: str,
|
| 44 |
+
):
|
| 45 |
+
global _lightgbm_network_params
|
| 46 |
+
|
| 47 |
+
with _lightgbm_network_params_lock:
|
| 48 |
+
assert (
|
| 49 |
+
_lightgbm_network_params is None
|
| 50 |
+
), "LightGBM network params are already initialized."
|
| 51 |
+
_lightgbm_network_params = dict(
|
| 52 |
+
num_machines=num_machines,
|
| 53 |
+
local_listen_port=local_listen_port,
|
| 54 |
+
machines=machines,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
|
| 59 |
+
class LightGBMConfig(BackendConfig):
|
| 60 |
+
"""Configuration for LightGBM distributed data-parallel training setup.
|
| 61 |
+
|
| 62 |
+
See the LightGBM docs for more information on the "network parameters"
|
| 63 |
+
that Ray Train sets up for you:
|
| 64 |
+
https://lightgbm.readthedocs.io/en/latest/Parameters.html#network-parameters
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
@property
|
| 68 |
+
def backend_cls(self):
|
| 69 |
+
return _LightGBMBackend
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class _LightGBMBackend(Backend):
|
| 73 |
+
def on_training_start(
|
| 74 |
+
self, worker_group: WorkerGroup, backend_config: LightGBMConfig
|
| 75 |
+
):
|
| 76 |
+
node_ips_and_ports = worker_group.execute(get_address_and_port)
|
| 77 |
+
ports = [port for _, port in node_ips_and_ports]
|
| 78 |
+
machines = ",".join(
|
| 79 |
+
[f"{node_ip}:{port}" for node_ip, port in node_ips_and_ports]
|
| 80 |
+
)
|
| 81 |
+
num_machines = len(worker_group)
|
| 82 |
+
ray.get(
|
| 83 |
+
[
|
| 84 |
+
worker_group.execute_single_async(
|
| 85 |
+
rank, _set_network_params, num_machines, ports[rank], machines
|
| 86 |
+
)
|
| 87 |
+
for rank in range(len(worker_group))
|
| 88 |
+
]
|
| 89 |
+
)
|
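The network parameters assembled by `_LightGBMBackend.on_training_start` above reduce to a small dictionary per worker. A standalone sketch with made-up addresses shows what rank 0 would receive:

# Made-up (ip, port) pairs standing in for the worker group's addresses.
node_ips_and_ports = [("10.0.0.1", 50001), ("10.0.0.2", 50002)]

machines = ",".join(f"{ip}:{port}" for ip, port in node_ips_and_ports)
num_machines = len(node_ips_and_ports)

# Rank 0 gets its own listen port plus the full machine list.
params_for_rank_0 = {
    "num_machines": num_machines,                   # 2
    "local_listen_port": node_ips_and_ports[0][1],  # 50001
    "machines": machines,  # "10.0.0.1:50001,10.0.0.2:50002"
}
print(params_for_rank_0)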
.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_checkpoint.py
ADDED
|
@@ -0,0 +1,70 @@
|
| 1 |
+
import tempfile
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import TYPE_CHECKING, Optional
|
| 4 |
+
|
| 5 |
+
import lightgbm
|
| 6 |
+
|
| 7 |
+
from ray.train._internal.framework_checkpoint import FrameworkCheckpoint
|
| 8 |
+
from ray.util.annotations import PublicAPI
|
| 9 |
+
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from ray.data.preprocessor import Preprocessor
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@PublicAPI(stability="beta")
|
| 15 |
+
class LightGBMCheckpoint(FrameworkCheckpoint):
|
| 16 |
+
"""A :py:class:`~ray.train.Checkpoint` with LightGBM-specific functionality."""
|
| 17 |
+
|
| 18 |
+
MODEL_FILENAME = "model.txt"
|
| 19 |
+
|
| 20 |
+
@classmethod
|
| 21 |
+
def from_model(
|
| 22 |
+
cls,
|
| 23 |
+
booster: lightgbm.Booster,
|
| 24 |
+
*,
|
| 25 |
+
preprocessor: Optional["Preprocessor"] = None,
|
| 26 |
+
path: Optional[str] = None,
|
| 27 |
+
) -> "LightGBMCheckpoint":
|
| 28 |
+
"""Create a :py:class:`~ray.train.Checkpoint` that stores a LightGBM model.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
booster: The LightGBM model to store in the checkpoint.
|
| 32 |
+
preprocessor: A fitted preprocessor to be applied before inference.
|
| 33 |
+
path: The path to the directory where the checkpoint file will be saved.
|
| 34 |
+
This should start as an empty directory, since the *entire*
|
| 35 |
+
directory will be treated as the checkpoint when reported.
|
| 36 |
+
By default, a temporary directory will be created.
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
A :py:class:`LightGBMCheckpoint` containing the specified ``Booster``.
|
| 40 |
+
|
| 41 |
+
Examples:
|
| 42 |
+
>>> import lightgbm
|
| 43 |
+
>>> import numpy as np
|
| 44 |
+
>>> from ray.train.lightgbm import LightGBMCheckpoint
|
| 45 |
+
>>>
|
| 46 |
+
>>> train_X = np.array([[1, 2], [3, 4]])
|
| 47 |
+
>>> train_y = np.array([0, 1])
|
| 48 |
+
>>>
|
| 49 |
+
>>> model = lightgbm.LGBMClassifier().fit(train_X, train_y)
|
| 50 |
+
>>> checkpoint = LightGBMCheckpoint.from_model(model.booster_)
|
| 51 |
+
"""
|
| 52 |
+
checkpoint_path = Path(path or tempfile.mkdtemp())
|
| 53 |
+
|
| 54 |
+
if not checkpoint_path.is_dir():
|
| 55 |
+
raise ValueError(f"`path` must be a directory, but got: {checkpoint_path}")
|
| 56 |
+
|
| 57 |
+
booster.save_model(checkpoint_path.joinpath(cls.MODEL_FILENAME).as_posix())
|
| 58 |
+
|
| 59 |
+
checkpoint = cls.from_directory(checkpoint_path.as_posix())
|
| 60 |
+
if preprocessor:
|
| 61 |
+
checkpoint.set_preprocessor(preprocessor)
|
| 62 |
+
|
| 63 |
+
return checkpoint
|
| 64 |
+
|
| 65 |
+
def get_model(self) -> lightgbm.Booster:
|
| 66 |
+
"""Retrieve the LightGBM model stored in this checkpoint."""
|
| 67 |
+
with self.as_directory() as checkpoint_path:
|
| 68 |
+
return lightgbm.Booster(
|
| 69 |
+
model_file=Path(checkpoint_path, self.MODEL_FILENAME).as_posix()
|
| 70 |
+
)
|
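A round-trip sketch mirroring the docstring examples above (requires `lightgbm` installed): store a trained booster in a `LightGBMCheckpoint`, then load it back with `get_model()`.

import lightgbm
import numpy as np

from ray.train.lightgbm import LightGBMCheckpoint

train_X = np.array([[1, 2], [3, 4]])
train_y = np.array([0, 1])
model = lightgbm.LGBMClassifier().fit(train_X, train_y)

checkpoint = LightGBMCheckpoint.from_model(model.booster_)
booster = checkpoint.get_model()
print(booster.num_trees())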
.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_predictor.py
ADDED
|
@@ -0,0 +1,152 @@
|
| 1 |
+
from typing import TYPE_CHECKING, List, Optional, Union
|
| 2 |
+
|
| 3 |
+
import lightgbm
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pandas.api.types import is_object_dtype
|
| 6 |
+
|
| 7 |
+
from ray.air.constants import TENSOR_COLUMN_NAME
|
| 8 |
+
from ray.air.data_batch_type import DataBatchType
|
| 9 |
+
from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
|
| 10 |
+
from ray.train.lightgbm import LightGBMCheckpoint
|
| 11 |
+
from ray.train.predictor import Predictor
|
| 12 |
+
from ray.util.annotations import PublicAPI
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from ray.data.preprocessor import Preprocessor
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@PublicAPI(stability="beta")
|
| 19 |
+
class LightGBMPredictor(Predictor):
|
| 20 |
+
"""A predictor for LightGBM models.
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
model: The LightGBM booster to use for predictions.
|
| 24 |
+
preprocessor: A preprocessor used to transform data batches prior
|
| 25 |
+
to prediction.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self, model: lightgbm.Booster, preprocessor: Optional["Preprocessor"] = None
|
| 30 |
+
):
|
| 31 |
+
self.model = model
|
| 32 |
+
super().__init__(preprocessor)
|
| 33 |
+
|
| 34 |
+
def __repr__(self):
|
| 35 |
+
return (
|
| 36 |
+
f"{self.__class__.__name__}(model={self.model!r}, "
|
| 37 |
+
f"preprocessor={self._preprocessor!r})"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
@classmethod
|
| 41 |
+
def from_checkpoint(cls, checkpoint: LightGBMCheckpoint) -> "LightGBMPredictor":
|
| 42 |
+
"""Instantiate the predictor from a LightGBMCheckpoint.
|
| 43 |
+
|
| 44 |
+
Args:
|
| 45 |
+
checkpoint: The checkpoint to load the model and preprocessor from.
|
| 46 |
+
|
| 47 |
+
"""
|
| 48 |
+
model = checkpoint.get_model()
|
| 49 |
+
preprocessor = checkpoint.get_preprocessor()
|
| 50 |
+
return cls(model=model, preprocessor=preprocessor)
|
| 51 |
+
|
| 52 |
+
def predict(
|
| 53 |
+
self,
|
| 54 |
+
data: DataBatchType,
|
| 55 |
+
feature_columns: Optional[Union[List[str], List[int]]] = None,
|
| 56 |
+
**predict_kwargs,
|
| 57 |
+
) -> DataBatchType:
|
| 58 |
+
"""Run inference on data batch.
|
| 59 |
+
|
| 60 |
+
Args:
|
| 61 |
+
data: A batch of input data.
|
| 62 |
+
feature_columns: The names or indices of the columns in the
|
| 63 |
+
data to use as features to predict on. If None, then use
|
| 64 |
+
all columns in ``data``.
|
| 65 |
+
**predict_kwargs: Keyword arguments passed to
|
| 66 |
+
``lightgbm.Booster.predict``.
|
| 67 |
+
|
| 68 |
+
Examples:
|
| 69 |
+
>>> import numpy as np
|
| 70 |
+
>>> import lightgbm as lgbm
|
| 71 |
+
>>> from ray.train.lightgbm import LightGBMPredictor
|
| 72 |
+
>>>
|
| 73 |
+
>>> train_X = np.array([[1, 2], [3, 4]])
|
| 74 |
+
>>> train_y = np.array([0, 1])
|
| 75 |
+
>>>
|
| 76 |
+
>>> model = lgbm.LGBMClassifier().fit(train_X, train_y)
|
| 77 |
+
>>> predictor = LightGBMPredictor(model=model.booster_)
|
| 78 |
+
>>>
|
| 79 |
+
>>> data = np.array([[1, 2], [3, 4]])
|
| 80 |
+
>>> predictions = predictor.predict(data)
|
| 81 |
+
>>>
|
| 82 |
+
>>> # Only use first and second column as the feature
|
| 83 |
+
>>> data = np.array([[1, 2, 8], [3, 4, 9]])
|
| 84 |
+
>>> predictions = predictor.predict(data, feature_columns=[0, 1])
|
| 85 |
+
|
| 86 |
+
>>> import pandas as pd
|
| 87 |
+
>>> import lightgbm as lgbm
|
| 88 |
+
>>> from ray.train.lightgbm import LightGBMPredictor
|
| 89 |
+
>>>
|
| 90 |
+
>>> train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
|
| 91 |
+
>>> train_y = pd.Series([0, 1])
|
| 92 |
+
>>>
|
| 93 |
+
>>> model = lgbm.LGBMClassifier().fit(train_X, train_y)
|
| 94 |
+
>>> predictor = LightGBMPredictor(model=model.booster_)
|
| 95 |
+
>>>
|
| 96 |
+
>>> # Pandas dataframe.
|
| 97 |
+
>>> data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
|
| 98 |
+
>>> predictions = predictor.predict(data)
|
| 99 |
+
>>>
|
| 100 |
+
>>> # Only use first and second column as the feature
|
| 101 |
+
>>> data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
|
| 102 |
+
>>> predictions = predictor.predict(data, feature_columns=["A", "B"])
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
Returns:
|
| 106 |
+
Prediction result.
|
| 107 |
+
|
| 108 |
+
"""
|
| 109 |
+
return Predictor.predict(
|
| 110 |
+
self, data, feature_columns=feature_columns, **predict_kwargs
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
def _predict_pandas(
|
| 114 |
+
self,
|
| 115 |
+
data: "pd.DataFrame",
|
| 116 |
+
feature_columns: Optional[Union[List[str], List[int]]] = None,
|
| 117 |
+
**predict_kwargs,
|
| 118 |
+
) -> pd.DataFrame:
|
| 119 |
+
feature_names = None
|
| 120 |
+
if TENSOR_COLUMN_NAME in data:
|
| 121 |
+
data = data[TENSOR_COLUMN_NAME].to_numpy()
|
| 122 |
+
data = _unwrap_ndarray_object_type_if_needed(data)
|
| 123 |
+
if feature_columns:
|
| 124 |
+
# In this case feature_columns is a list of integers
|
| 125 |
+
data = data[:, feature_columns]
|
| 126 |
+
# Turn into dataframe to make dtype resolution easy
|
| 127 |
+
data = pd.DataFrame(data, columns=feature_names)
|
| 128 |
+
data = data.infer_objects()
|
| 129 |
+
|
| 130 |
+
# Pandas does not detect categorical dtypes. Any remaining object
|
| 131 |
+
# dtypes are probably categories, so convert them.
|
| 132 |
+
# This will fail if we have a category composed entirely of
|
| 133 |
+
# integers, but this is the best we can do here.
|
| 134 |
+
update_dtypes = {}
|
| 135 |
+
for column in data.columns:
|
| 136 |
+
dtype = data.dtypes[column]
|
| 137 |
+
if is_object_dtype(dtype):
|
| 138 |
+
update_dtypes[column] = pd.CategoricalDtype()
|
| 139 |
+
|
| 140 |
+
if update_dtypes:
|
| 141 |
+
data = data.astype(update_dtypes, copy=False)
|
| 142 |
+
elif feature_columns:
|
| 143 |
+
# feature_columns is a list of integers or strings
|
| 144 |
+
data = data[feature_columns]
|
| 145 |
+
|
| 146 |
+
df = pd.DataFrame(self.model.predict(data, **predict_kwargs))
|
| 147 |
+
df.columns = (
|
| 148 |
+
["predictions"]
|
| 149 |
+
if len(df.columns) == 1
|
| 150 |
+
else [f"predictions_{i}" for i in range(len(df.columns))]
|
| 151 |
+
)
|
| 152 |
+
return df
|
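The output-column naming rule at the end of `_predict_pandas` above can be shown in isolation: a single output column is named ``predictions``, while multi-output predictions become ``predictions_0``, ``predictions_1``, and so on. The values below are made up.

import pandas as pd

# Two made-up output columns, e.g. per-class probabilities.
df = pd.DataFrame([[0.2, 0.8], [0.9, 0.1]])
df.columns = (
    ["predictions"]
    if len(df.columns) == 1
    else [f"predictions_{i}" for i in range(len(df.columns))]
)
print(list(df.columns))  # ['predictions_0', 'predictions_1']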
.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py
ADDED
|
@@ -0,0 +1,221 @@
|
| 1 |
+
import logging
|
| 2 |
+
from functools import partial
|
| 3 |
+
from typing import Any, Dict, Optional
|
| 4 |
+
|
| 5 |
+
import lightgbm
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray.train import Checkpoint
|
| 9 |
+
from ray.train.constants import _DEPRECATED_VALUE, TRAIN_DATASET_KEY
|
| 10 |
+
from ray.train.lightgbm import RayTrainReportCallback
|
| 11 |
+
from ray.train.lightgbm.v2 import LightGBMTrainer as SimpleLightGBMTrainer
|
| 12 |
+
from ray.train.trainer import GenDataset
|
| 13 |
+
from ray.util.annotations import PublicAPI
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _lightgbm_train_fn_per_worker(
|
| 19 |
+
config: dict,
|
| 20 |
+
label_column: str,
|
| 21 |
+
num_boost_round: int,
|
| 22 |
+
dataset_keys: set,
|
| 23 |
+
lightgbm_train_kwargs: dict,
|
| 24 |
+
):
|
| 25 |
+
checkpoint = ray.train.get_checkpoint()
|
| 26 |
+
starting_model = None
|
| 27 |
+
remaining_iters = num_boost_round
|
| 28 |
+
if checkpoint:
|
| 29 |
+
starting_model = RayTrainReportCallback.get_model(checkpoint)
|
| 30 |
+
starting_iter = starting_model.current_iteration()
|
| 31 |
+
remaining_iters = num_boost_round - starting_iter
|
| 32 |
+
logger.info(
|
| 33 |
+
f"Model loaded from checkpoint will train for "
|
| 34 |
+
f"additional {remaining_iters} iterations (trees) in order "
|
| 35 |
+
"to achieve the target number of iterations "
|
| 36 |
+
f"({num_boost_round=})."
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
|
| 40 |
+
train_df = train_ds_iter.materialize().to_pandas()
|
| 41 |
+
|
| 42 |
+
eval_ds_iters = {
|
| 43 |
+
k: ray.train.get_dataset_shard(k)
|
| 44 |
+
for k in dataset_keys
|
| 45 |
+
if k != TRAIN_DATASET_KEY
|
| 46 |
+
}
|
| 47 |
+
eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()}
|
| 48 |
+
|
| 49 |
+
train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column]
|
| 50 |
+
train_set = lightgbm.Dataset(train_X, label=train_y)
|
| 51 |
+
|
| 52 |
+
# NOTE: Include the training dataset in the evaluation datasets.
|
| 53 |
+
# This allows `train-*` metrics to be calculated and reported.
|
| 54 |
+
valid_sets = [train_set]
|
| 55 |
+
valid_names = [TRAIN_DATASET_KEY]
|
| 56 |
+
|
| 57 |
+
for eval_name, eval_df in eval_dfs.items():
|
| 58 |
+
eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column]
|
| 59 |
+
valid_sets.append(lightgbm.Dataset(eval_X, label=eval_y))
|
| 60 |
+
valid_names.append(eval_name)
|
| 61 |
+
|
| 62 |
+
# Add network params of the worker group to enable distributed training.
|
| 63 |
+
config.update(ray.train.lightgbm.v2.get_network_params())
|
| 64 |
+
|
| 65 |
+
lightgbm.train(
|
| 66 |
+
params=config,
|
| 67 |
+
train_set=train_set,
|
| 68 |
+
num_boost_round=remaining_iters,
|
| 69 |
+
valid_sets=valid_sets,
|
| 70 |
+
valid_names=valid_names,
|
| 71 |
+
init_model=starting_model,
|
| 72 |
+
**lightgbm_train_kwargs,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@PublicAPI(stability="beta")
|
| 77 |
+
class LightGBMTrainer(SimpleLightGBMTrainer):
|
| 78 |
+
"""A Trainer for data parallel LightGBM training.
|
| 79 |
+
|
| 80 |
+
This Trainer runs the LightGBM training loop in a distributed manner
|
| 81 |
+
using multiple Ray Actors.
|
| 82 |
+
|
| 83 |
+
If you would like to take advantage of LightGBM's built-in handling
|
| 84 |
+
for features with the categorical data type, consider applying the
|
| 85 |
+
:class:`Categorizer` preprocessor to set the dtypes in the dataset.
|
| 86 |
+
|
| 87 |
+
.. note::
|
| 88 |
+
``LightGBMTrainer`` does not modify or otherwise alter the working
|
| 89 |
+
of the LightGBM distributed training algorithm.
|
| 90 |
+
Ray only provides orchestration, data ingest and fault tolerance.
|
| 91 |
+
For more information on LightGBM distributed training, refer to
|
| 92 |
+
`LightGBM documentation <https://lightgbm.readthedocs.io/>`__.
|
| 93 |
+
|
| 94 |
+
Example:
|
| 95 |
+
.. testcode::
|
| 96 |
+
|
| 97 |
+
import ray
|
| 98 |
+
|
| 99 |
+
from ray.train.lightgbm import LightGBMTrainer
|
| 100 |
+
from ray.train import ScalingConfig
|
| 101 |
+
|
| 102 |
+
train_dataset = ray.data.from_items(
|
| 103 |
+
[{"x": x, "y": x + 1} for x in range(32)]
|
| 104 |
+
)
|
| 105 |
+
trainer = LightGBMTrainer(
|
| 106 |
+
label_column="y",
|
| 107 |
+
params={"objective": "regression"},
|
| 108 |
+
scaling_config=ScalingConfig(num_workers=3),
|
| 109 |
+
datasets={"train": train_dataset},
|
| 110 |
+
)
|
| 111 |
+
result = trainer.fit()
|
| 112 |
+
|
| 113 |
+
.. testoutput::
|
| 114 |
+
:hide:
|
| 115 |
+
|
| 116 |
+
...
|
| 117 |
+
|
| 118 |
+
Args:
|
| 119 |
+
datasets: The Ray Datasets to use for training and validation. Must include a
|
| 120 |
+
"train" key denoting the training dataset. All non-training datasets will
|
| 121 |
+
be used as separate validation sets, each reporting a separate metric.
|
| 122 |
+
label_column: Name of the label column. A column with this name
|
| 123 |
+
must be present in the training dataset.
|
| 124 |
+
params: LightGBM training parameters passed to ``lightgbm.train()``.
|
| 125 |
+
Refer to `LightGBM documentation <https://lightgbm.readthedocs.io>`_
|
| 126 |
+
for a list of possible parameters.
|
| 127 |
+
num_boost_round: Target number of boosting iterations (trees in the model).
|
| 128 |
+
Note that unlike in ``lightgbm.train``, this is the target number
|
| 129 |
+
of trees, meaning that if you set ``num_boost_round=10`` and pass a model
|
| 130 |
+
that has already been trained for 5 iterations, it will be trained for 5
|
| 131 |
+
iterations more, instead of 10 more.
|
| 132 |
+
scaling_config: Configuration for how to scale data parallel training.
|
| 133 |
+
run_config: Configuration for the execution of the training run.
|
| 134 |
+
resume_from_checkpoint: A checkpoint to resume training from.
|
| 135 |
+
metadata: Dict that should be made available in `checkpoint.get_metadata()`
|
| 136 |
+
for checkpoints saved from this Trainer. Must be JSON-serializable.
|
| 137 |
+
**train_kwargs: Additional kwargs passed to ``lightgbm.train()`` function.
|
| 138 |
+
"""
|
| 139 |
+
|
| 140 |
+
_handles_checkpoint_freq = True
|
| 141 |
+
_handles_checkpoint_at_end = True
|
| 142 |
+
|
| 143 |
+
def __init__(
|
| 144 |
+
self,
|
| 145 |
+
*,
|
| 146 |
+
datasets: Dict[str, GenDataset],
|
| 147 |
+
label_column: str,
|
| 148 |
+
params: Dict[str, Any],
|
| 149 |
+
num_boost_round: int = 10,
|
| 150 |
+
scaling_config: Optional[ray.train.ScalingConfig] = None,
|
| 151 |
+
run_config: Optional[ray.train.RunConfig] = None,
|
| 152 |
+
dataset_config: Optional[ray.train.DataConfig] = None,
|
| 153 |
+
resume_from_checkpoint: Optional[Checkpoint] = None,
|
| 154 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 155 |
+
dmatrix_params: Optional[Dict[str, Dict[str, Any]]] = _DEPRECATED_VALUE,
|
| 156 |
+
**train_kwargs,
|
| 157 |
+
):
|
| 158 |
+
# TODO(justinvyu): [Deprecated] Remove in 2.11
|
| 159 |
+
if dmatrix_params != _DEPRECATED_VALUE:
|
| 160 |
+
raise DeprecationWarning(
|
| 161 |
+
"`dmatrix_params` is deprecated, since XGBoostTrainer no longer "
|
| 162 |
+
"depends on the `xgboost_ray.RayDMatrix` utility. "
|
| 163 |
+
"You can remove this argument and use `dataset_config` instead "
|
| 164 |
+
"to customize Ray Dataset ingestion."
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
# Initialize a default Ray Train metrics/checkpoint reporting callback if needed
|
| 168 |
+
callbacks = train_kwargs.get("callbacks", [])
|
| 169 |
+
user_supplied_callback = any(
|
| 170 |
+
isinstance(callback, RayTrainReportCallback) for callback in callbacks
|
| 171 |
+
)
|
| 172 |
+
callback_kwargs = {}
|
| 173 |
+
if run_config:
|
| 174 |
+
checkpoint_frequency = run_config.checkpoint_config.checkpoint_frequency
|
| 175 |
+
checkpoint_at_end = run_config.checkpoint_config.checkpoint_at_end
|
| 176 |
+
|
| 177 |
+
callback_kwargs["frequency"] = checkpoint_frequency
|
| 178 |
+
# Default `checkpoint_at_end=True` unless the user explicitly sets it.
|
| 179 |
+
callback_kwargs["checkpoint_at_end"] = (
|
| 180 |
+
checkpoint_at_end if checkpoint_at_end is not None else True
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
if not user_supplied_callback:
|
| 184 |
+
callbacks.append(RayTrainReportCallback(**callback_kwargs))
|
| 185 |
+
train_kwargs["callbacks"] = callbacks
|
| 186 |
+
|
| 187 |
+
train_fn_per_worker = partial(
|
| 188 |
+
_lightgbm_train_fn_per_worker,
|
| 189 |
+
label_column=label_column,
|
| 190 |
+
num_boost_round=num_boost_round,
|
| 191 |
+
dataset_keys=set(datasets),
|
| 192 |
+
lightgbm_train_kwargs=train_kwargs,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
super(LightGBMTrainer, self).__init__(
|
| 196 |
+
train_loop_per_worker=train_fn_per_worker,
|
| 197 |
+
train_loop_config=params,
|
| 198 |
+
scaling_config=scaling_config,
|
| 199 |
+
run_config=run_config,
|
| 200 |
+
datasets=datasets,
|
| 201 |
+
dataset_config=dataset_config,
|
| 202 |
+
resume_from_checkpoint=resume_from_checkpoint,
|
| 203 |
+
metadata=metadata,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
@classmethod
|
| 207 |
+
def get_model(
|
| 208 |
+
cls,
|
| 209 |
+
checkpoint: Checkpoint,
|
| 210 |
+
) -> lightgbm.Booster:
|
| 211 |
+
"""Retrieve the LightGBM model stored in this checkpoint."""
|
| 212 |
+
return RayTrainReportCallback.get_model(checkpoint)
|
| 213 |
+
|
| 214 |
+
def _validate_attributes(self):
|
| 215 |
+
super()._validate_attributes()
|
| 216 |
+
|
| 217 |
+
if TRAIN_DATASET_KEY not in self.datasets:
|
| 218 |
+
raise KeyError(
|
| 219 |
+
f"'{TRAIN_DATASET_KEY}' key must be preset in `datasets`. "
|
| 220 |
+
f"Got {list(self.datasets.keys())}"
|
| 221 |
+
)
|
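A short sketch of the `num_boost_round` semantics documented above: the value is a target total, so resuming from a checkpointed model that already contains 5 trees with `num_boost_round=10` trains only 5 additional iterations (mirroring the `remaining_iters` computation in `_lightgbm_train_fn_per_worker`).

num_boost_round = 10
starting_iter = 5  # e.g. starting_model.current_iteration() after loading a checkpoint
remaining_iters = num_boost_round - starting_iter
print(f"Will train for {remaining_iters} more iterations")  # 5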
.venv/lib/python3.11/site-packages/ray/train/lightgbm/v2.py
ADDED
|
@@ -0,0 +1,132 @@
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, Callable, Dict, Optional, Union
|
| 3 |
+
|
| 4 |
+
import ray.train
|
| 5 |
+
from ray.train import Checkpoint
|
| 6 |
+
from ray.train.data_parallel_trainer import DataParallelTrainer
|
| 7 |
+
from ray.train.lightgbm.config import LightGBMConfig, get_network_params # noqa: F401
|
| 8 |
+
from ray.train.trainer import GenDataset
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class LightGBMTrainer(DataParallelTrainer):
|
| 14 |
+
"""A Trainer for distributed data-parallel LightGBM training.
|
| 15 |
+
|
| 16 |
+
Example
|
| 17 |
+
-------
|
| 18 |
+
|
| 19 |
+
.. testcode::
|
| 20 |
+
|
| 21 |
+
import lightgbm as lgb
|
| 22 |
+
|
| 23 |
+
import ray.data
|
| 24 |
+
import ray.train
|
| 25 |
+
from ray.train.lightgbm import RayTrainReportCallback
|
| 26 |
+
from ray.train.lightgbm.v2 import LightGBMTrainer
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def train_fn_per_worker(config: dict):
|
| 30 |
+
# (Optional) Add logic to resume training state from a checkpoint.
|
| 31 |
+
# ray.train.get_checkpoint()
|
| 32 |
+
|
| 33 |
+
# 1. Get the dataset shard for the worker and convert to a `lgb.Dataset`
|
| 34 |
+
train_ds_iter, eval_ds_iter = (
|
| 35 |
+
ray.train.get_dataset_shard("train"),
|
| 36 |
+
ray.train.get_dataset_shard("validation"),
|
| 37 |
+
)
|
| 38 |
+
train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()
|
| 39 |
+
train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
|
| 40 |
+
train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
|
| 41 |
+
eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]
|
| 42 |
+
|
| 43 |
+
train_set = lgb.Dataset(train_X, label=train_y)
|
| 44 |
+
eval_set = lgb.Dataset(eval_X, label=eval_y)
|
| 45 |
+
|
| 46 |
+
# 2. Run distributed data-parallel training.
|
| 47 |
+
# `get_network_params` sets up the necessary configurations for LightGBM
|
| 48 |
+
# to set up the data parallel training worker group on your Ray cluster.
|
| 49 |
+
params = {
|
| 50 |
+
"objective": "regression",
|
| 51 |
+
# Adding the line below is the only change needed
|
| 52 |
+
# for your `lgb.train` call!
|
| 53 |
+
**ray.train.lightgbm.v2.get_network_params(),
|
| 54 |
+
}
|
| 55 |
+
lgb.train(
|
| 56 |
+
params,
|
| 57 |
+
train_set,
|
| 58 |
+
valid_sets=[eval_set],
|
| 59 |
+
valid_names=["eval"],
|
| 60 |
+
callbacks=[RayTrainReportCallback()],
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
|
| 64 |
+
eval_ds = ray.data.from_items(
|
| 65 |
+
[{"x": x, "y": x + 1} for x in range(32, 32 + 16)]
|
| 66 |
+
)
|
| 67 |
+
trainer = LightGBMTrainer(
|
| 68 |
+
train_fn_per_worker,
|
| 69 |
+
datasets={"train": train_ds, "validation": eval_ds},
|
| 70 |
+
scaling_config=ray.train.ScalingConfig(num_workers=4),
|
| 71 |
+
)
|
| 72 |
+
result = trainer.fit()
|
| 73 |
+
booster = RayTrainReportCallback.get_model(result.checkpoint)
|
| 74 |
+
|
| 75 |
+
.. testoutput::
|
| 76 |
+
:hide:
|
| 77 |
+
|
| 78 |
+
...
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
train_loop_per_worker: The training function to execute on each worker.
|
| 82 |
+
This function can either take in zero arguments or a single ``Dict``
|
| 83 |
+
argument which is set by defining ``train_loop_config``.
|
| 84 |
+
Within this function you can use any of the
|
| 85 |
+
:ref:`Ray Train Loop utilities <train-loop-api>`.
|
| 86 |
+
train_loop_config: A configuration ``Dict`` to pass in as an argument to
|
| 87 |
+
``train_loop_per_worker``.
|
| 88 |
+
This is typically used for specifying hyperparameters.
|
| 89 |
+
lightgbm_config: The configuration for setting up the distributed lightgbm
|
| 90 |
+
backend. See :class:`~ray.train.lightgbm.LightGBMConfig` for more info.
|
| 91 |
+
datasets: The Ray Datasets to use for training and validation.
|
| 92 |
+
dataset_config: The configuration for ingesting the input ``datasets``.
|
| 93 |
+
By default, all the Ray Datasets are split equally across workers.
|
| 94 |
+
See :class:`~ray.train.DataConfig` for more details.
|
| 95 |
+
scaling_config: The configuration for how to scale data parallel training.
|
| 96 |
+
``num_workers`` determines how many Python processes are used for training,
|
| 97 |
+
and ``use_gpu`` determines whether or not each process should use GPUs.
|
| 98 |
+
See :class:`~ray.train.ScalingConfig` for more info.
|
| 99 |
+
run_config: The configuration for the execution of the training run.
|
| 100 |
+
See :class:`~ray.train.RunConfig` for more info.
|
| 101 |
+
resume_from_checkpoint: A checkpoint to resume training from.
|
| 102 |
+
This checkpoint can be accessed from within ``train_loop_per_worker``
|
| 103 |
+
by calling ``ray.train.get_checkpoint()``.
|
| 104 |
+
metadata: Dict that should be made available via
|
| 105 |
+
`ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
|
| 106 |
+
for checkpoints saved from this Trainer. Must be JSON-serializable.
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
def __init__(
|
| 110 |
+
self,
|
| 111 |
+
train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
|
| 112 |
+
*,
|
| 113 |
+
train_loop_config: Optional[Dict] = None,
|
| 114 |
+
lightgbm_config: Optional[LightGBMConfig] = None,
|
| 115 |
+
scaling_config: Optional[ray.train.ScalingConfig] = None,
|
| 116 |
+
run_config: Optional[ray.train.RunConfig] = None,
|
| 117 |
+
datasets: Optional[Dict[str, GenDataset]] = None,
|
| 118 |
+
dataset_config: Optional[ray.train.DataConfig] = None,
|
| 119 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 120 |
+
resume_from_checkpoint: Optional[Checkpoint] = None,
|
| 121 |
+
):
|
| 122 |
+
super(LightGBMTrainer, self).__init__(
|
| 123 |
+
train_loop_per_worker=train_loop_per_worker,
|
| 124 |
+
train_loop_config=train_loop_config,
|
| 125 |
+
backend_config=lightgbm_config or LightGBMConfig(),
|
| 126 |
+
scaling_config=scaling_config,
|
| 127 |
+
dataset_config=dataset_config,
|
| 128 |
+
run_config=run_config,
|
| 129 |
+
datasets=datasets,
|
| 130 |
+
resume_from_checkpoint=resume_from_checkpoint,
|
| 131 |
+
metadata=metadata,
|
| 132 |
+
)
|
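The docstring example above leaves checkpoint resumption as a comment (`# ray.train.get_checkpoint()`). Below is a minimal sketch of what that step could look like inside `train_fn_per_worker`, assuming the checkpoint was produced by `RayTrainReportCallback` in an earlier run; the toy in-memory dataset and the use of LightGBM's standard `init_model` keyword are illustrative, and the distributed network params from the docstring example are omitted for brevity.

import lightgbm as lgb
import numpy as np

import ray.train
from ray.train.lightgbm import RayTrainReportCallback


def train_fn_per_worker(config: dict):
    # Resume from the booster saved by RayTrainReportCallback, if a checkpoint exists.
    checkpoint = ray.train.get_checkpoint()
    init_model = RayTrainReportCallback.get_model(checkpoint) if checkpoint else None

    # Tiny in-memory dataset, purely for illustration.
    X = np.arange(32, dtype=float).reshape(-1, 1)
    y = X.ravel() + 1.0
    train_set = lgb.Dataset(X, label=y)

    lgb.train(
        {"objective": "regression"},
        train_set,
        init_model=init_model,  # continue boosting from the loaded model, if any
        callbacks=[RayTrainReportCallback()],
    )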
.venv/lib/python3.11/site-packages/ray/train/predictor.py
ADDED
@@ -0,0 +1,254 @@
import abc
from typing import Callable, Dict, Optional, Type, Union

import numpy as np
import pandas as pd

from ray.air.data_batch_type import DataBatchType
from ray.air.util.data_batch_conversion import (
    BatchFormat,
    _convert_batch_type_to_numpy,
    _convert_batch_type_to_pandas,
)
from ray.data import Preprocessor
from ray.train import Checkpoint
from ray.util.annotations import DeveloperAPI, PublicAPI

try:
    import pyarrow

    pa_table = pyarrow.Table
except ImportError:
    pa_table = None

# Reverse mapping from data batch type to batch format.
TYPE_TO_ENUM: Dict[Type[DataBatchType], BatchFormat] = {
    np.ndarray: BatchFormat.NUMPY,
    dict: BatchFormat.NUMPY,
    pd.DataFrame: BatchFormat.PANDAS,
}


@PublicAPI(stability="beta")
class PredictorNotSerializableException(RuntimeError):
    """Error raised when trying to serialize a Predictor instance."""

    pass


@PublicAPI(stability="beta")
class Predictor(abc.ABC):
    """Predictors load models from checkpoints to perform inference.

    .. note::
        The base ``Predictor`` class cannot be instantiated directly. Only one of
        its subclasses can be used.

    **How does a Predictor work?**

    Predictors expose a ``predict`` method that accepts an input batch of type
    ``DataBatchType`` and outputs predictions of the same type as the input batch.

    When the ``predict`` method is called the following occurs:

    - The input batch is converted into a pandas DataFrame. Tensor input (like a
      ``np.ndarray``) will be converted into a single column pandas DataFrame.
    - If there is a :ref:`Preprocessor <preprocessor-ref>` saved in the provided
      :class:`Checkpoint <ray.train.Checkpoint>`, the preprocessor will be used to
      transform the DataFrame.
    - The transformed DataFrame will be passed to the model for inference (via the
      ``predictor._predict_pandas`` method).
    - The predictions will be returned by ``predict`` in the same type as the
      original input.

    **How do I create a new Predictor?**

    To implement a new Predictor for your particular framework, you should subclass
    the base ``Predictor`` and implement the following methods:

    1. ``_predict_pandas``: Given a pandas.DataFrame input, return a
       pandas.DataFrame containing predictions.
    2. ``from_checkpoint``: Logic for creating a Predictor from a
       :class:`Checkpoint <ray.train.Checkpoint>`.
    3. Optionally ``_predict_numpy`` for better performance when working with
       tensor data to avoid extra copies from Pandas conversions.
    """

    def __init__(self, preprocessor: Optional[Preprocessor] = None):
        """Subclasses must call Predictor.__init__() to set a preprocessor."""
        self._preprocessor: Optional[Preprocessor] = preprocessor
        # Whether tensor columns should be automatically cast from/to the tensor
        # extension type at UDF boundaries. This can be overridden by subclasses.
        self._cast_tensor_columns = False

    @classmethod
    @abc.abstractmethod
    def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "Predictor":
        """Create a specific predictor from a checkpoint.

        Args:
            checkpoint: Checkpoint to load predictor data from.
            kwargs: Arguments specific to predictor implementations.

        Returns:
            Predictor: Predictor object.
        """
        raise NotImplementedError

    @classmethod
    def from_pandas_udf(
        cls, pandas_udf: Callable[[pd.DataFrame], pd.DataFrame]
    ) -> "Predictor":
        """Create a Predictor from a Pandas UDF.

        Args:
            pandas_udf: A function that takes a pandas.DataFrame and other
                optional kwargs and returns a pandas.DataFrame.
        """

        class PandasUDFPredictor(Predictor):
            @classmethod
            def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "Predictor":
                return PandasUDFPredictor()

            def _predict_pandas(self, df, **kwargs) -> "pd.DataFrame":
                return pandas_udf(df, **kwargs)

        return PandasUDFPredictor()

    def get_preprocessor(self) -> Optional[Preprocessor]:
        """Get the preprocessor to use prior to executing predictions."""
        return self._preprocessor

    def set_preprocessor(self, preprocessor: Optional[Preprocessor]) -> None:
        """Set the preprocessor to use prior to executing predictions."""
        self._preprocessor = preprocessor

    @classmethod
    @DeveloperAPI
    def preferred_batch_format(cls) -> BatchFormat:
        """Batch format hint for upstream producers to try yielding best block format.

        The preferred batch format to use if both `_predict_pandas` and
        `_predict_numpy` are implemented. Defaults to Pandas.

        Can be overridden by predictor classes depending on the framework type,
        e.g. TorchPredictor prefers Numpy and XGBoostPredictor prefers Pandas as
        native batch format.
        """
        return BatchFormat.PANDAS

    @classmethod
    def _batch_format_to_use(cls) -> BatchFormat:
        """Determine the batch format to use for the predictor."""
        has_pandas_implemented = cls._predict_pandas != Predictor._predict_pandas
        has_numpy_implemented = cls._predict_numpy != Predictor._predict_numpy
        if has_pandas_implemented and has_numpy_implemented:
            return cls.preferred_batch_format()
        elif has_pandas_implemented:
            return BatchFormat.PANDAS
        elif has_numpy_implemented:
            return BatchFormat.NUMPY
        else:
            raise NotImplementedError(
                f"Predictor {cls.__name__} must implement at least one of "
                "`_predict_pandas` and `_predict_numpy`."
            )

    def _set_cast_tensor_columns(self):
        """Enable automatic tensor column casting.

        If this is called on a predictor, the predictor will cast tensor columns to
        NumPy ndarrays in the input to the preprocessors and cast tensor columns back
        to the tensor extension type in the prediction outputs.
        """
        self._cast_tensor_columns = True

    def predict(self, data: DataBatchType, **kwargs) -> DataBatchType:
        """Perform inference on a batch of data.

        Args:
            data: A batch of input data of type ``DataBatchType``.
            kwargs: Arguments specific to predictor implementations. These are passed
                directly to ``_predict_numpy`` or ``_predict_pandas``.

        Returns:
            DataBatchType:
                Prediction result. The return type will be the same as the input type.
        """
        if not hasattr(self, "_preprocessor"):
            raise NotImplementedError(
                "Subclasses of Predictor must call Predictor.__init__(preprocessor)."
            )
        try:
            batch_format = TYPE_TO_ENUM[type(data)]
        except KeyError:
            raise RuntimeError(
                f"Invalid input data type of {type(data)}, supported "
                f"types: {list(TYPE_TO_ENUM.keys())}"
            )

        if self._preprocessor:
            data = self._preprocessor.transform_batch(data)

        batch_format_to_use = self._batch_format_to_use()

        # We can finish prediction as long as one predict method is implemented.
        # For prediction, we have to return back in the same format as the input.
        if batch_format == BatchFormat.PANDAS:
            if batch_format_to_use == BatchFormat.PANDAS:
                return self._predict_pandas(
                    _convert_batch_type_to_pandas(data), **kwargs
                )
            elif batch_format_to_use == BatchFormat.NUMPY:
                return _convert_batch_type_to_pandas(
                    self._predict_numpy(_convert_batch_type_to_numpy(data), **kwargs)
                )
        elif batch_format == BatchFormat.NUMPY:
            if batch_format_to_use == BatchFormat.PANDAS:
                return _convert_batch_type_to_numpy(
                    self._predict_pandas(_convert_batch_type_to_pandas(data), **kwargs)
                )
            elif batch_format_to_use == BatchFormat.NUMPY:
                return self._predict_numpy(_convert_batch_type_to_numpy(data), **kwargs)

    @DeveloperAPI
    def _predict_pandas(self, data: "pd.DataFrame", **kwargs) -> "pd.DataFrame":
        """Perform inference on a Pandas DataFrame.

        Args:
            data: A pandas DataFrame to perform predictions on.
            kwargs: Arguments specific to the predictor implementation.

        Returns:
            A pandas DataFrame containing the prediction result.
        """
        raise NotImplementedError

    @DeveloperAPI
    def _predict_numpy(
        self, data: Union[np.ndarray, Dict[str, np.ndarray]], **kwargs
    ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """Perform inference on Numpy data.

        All Predictors working with tensor data (like deep learning predictors)
        should implement this method.

        Args:
            data: A Numpy ndarray or dictionary of ndarrays to perform predictions on.
            kwargs: Arguments specific to the predictor implementation.

        Returns:
            A Numpy ndarray or dictionary of ndarrays containing the prediction result.
        """
        raise NotImplementedError

    def __reduce__(self):
        raise PredictorNotSerializableException(
            "Predictor instances are not serializable. Instead, you may want "
            "to serialize a checkpoint and initialize the Predictor with "
            "Predictor.from_checkpoint."
        )
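As a concrete illustration of the subclassing contract described in the `Predictor` docstring above, here is a minimal sketch of a pandas-based predictor. The `ThresholdPredictor` class and its threshold-in-metadata layout are hypothetical; only the `from_checkpoint` and `_predict_pandas` overrides mirror the documented interface.

import pandas as pd

from ray.train import Checkpoint
from ray.train.predictor import Predictor


class ThresholdPredictor(Predictor):
    """Hypothetical predictor that labels rows by comparing the first column to a threshold."""

    def __init__(self, threshold: float, preprocessor=None):
        self.threshold = threshold
        super().__init__(preprocessor)

    @classmethod
    def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "ThresholdPredictor":
        # A real implementation would load model files from the checkpoint directory;
        # here we just read a threshold from the checkpoint metadata (assumed layout).
        threshold = checkpoint.get_metadata().get("threshold", 0.5)
        return cls(threshold=threshold)

    def _predict_pandas(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        # Return predictions as a DataFrame, as the base class expects.
        labels = (data.iloc[:, 0] > self.threshold).astype(int)
        return pd.DataFrame({"predictions": labels})


# Usage: predictions come back in the same batch format that was passed in.
predictor = ThresholdPredictor(threshold=0.5)
print(predictor.predict(pd.DataFrame({"x": [0.1, 0.9]})))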
.venv/lib/python3.11/site-packages/ray/train/session.py
ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/train/trainer.py
ADDED
@@ -0,0 +1,194 @@
import logging
import traceback
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, TypeVar, Union

from ray.air._internal.util import (
    StartTraceback,
    StartTracebackWithWorkerRank,
    skip_exceptions,
)
from ray.data import Dataset
from ray.train import Checkpoint, DataConfig
from ray.train._internal.backend_executor import (
    BackendExecutor,
    InactiveWorkerGroupError,
    TrainBackendError,
    TrainingWorkerError,
)
from ray.train._internal.session import _TrainingResult, _TrainSession, get_session
from ray.train._internal.utils import ActorWrapper
from ray.train.backend import BackendConfig
from ray.train.base_trainer import (  # noqa: F401
    BaseTrainer,
    GenDataset,
    TrainingFailedError,
)
from ray.util.annotations import DeveloperAPI

T = TypeVar("T")
S = TypeVar("S")

logger = logging.getLogger(__name__)


@DeveloperAPI
class TrainingIterator:
    """An iterator over Train results. Returned by ``trainer.run_iterator``."""

    def __init__(
        self,
        backend_executor: Union[BackendExecutor, ActorWrapper],
        backend_config: BackendConfig,
        train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
        datasets: Dict[str, Dataset],
        metadata: Dict[str, Any],
        data_config: DataConfig,
        checkpoint: Optional[Union[Dict, str, Path, Checkpoint]],
    ):
        self._backend_executor = backend_executor
        self._backend = backend_config.backend_cls()
        self._train_func = train_func
        self._datasets = datasets
        self._metadata = metadata
        self._data_config = data_config

        self._start_training(
            train_func=train_func,
            datasets=self._datasets,
            metadata=self._metadata,
            data_config=self._data_config,
            checkpoint=checkpoint,
        )

        self._finished_training = False

    def __iter__(self):
        return self

    def _start_training(
        self,
        train_func,
        datasets,
        metadata,
        data_config,
        checkpoint: Optional[Checkpoint] = None,
    ):
        tune_session: _TrainSession = get_session()
        assert tune_session, "`_start_training` should only be called from within Tune"
        storage = tune_session.storage

        self._run_with_error_handling(
            lambda: self._backend_executor.start_training(
                train_func=train_func,
                datasets=datasets,
                metadata=metadata,
                data_config=data_config,
                storage=storage,
                checkpoint=checkpoint,
            )
        )

    def _run_with_error_handling(self, func: Callable):
        try:
            return func()
        except TrainingWorkerError:
            # TODO(ml-team): This Train fault-tolerance code doesn't get used
            # since max_retries=0
            # Workers have already been restarted.
            logger.info(
                "Workers have been successfully restarted. Resuming "
                "training from latest checkpoint."
            )
            self._start_training(
                self._train_func,
                self._datasets,
                self._metadata,
                self._data_config,
            )
            return self._run_with_error_handling(func)
        except InactiveWorkerGroupError:
            raise RuntimeError(
                "This Trainer is not active. It is either shutdown "
                "already or never started in the first place. "
                "Either create a new Trainer or start this one."
            ) from None
        except TrainBackendError:
            raise RuntimeError(
                "Training failed. You should not be seeing "
                "this error and this is a bug. Please create "
                "a new issue at "
                "https://github.com/ray-project/ray."
            ) from None

    def __next__(self):
        if self.is_finished():
            self._backend_executor.report_final_run_status(errored=False)
            raise StopIteration
        try:
            next_results = self._run_with_error_handling(self._fetch_next_result)
            if next_results is None:
                self._backend_executor.report_final_run_status(errored=False)
                self._run_with_error_handling(self._finish_training)
                self._finished_training = True
                raise StopIteration
            else:
                return next_results
        except StartTraceback as e:
            # If this is a StartTraceback, then this is a user error.
            # We raise it directly
            if isinstance(e, StartTracebackWithWorkerRank):
                failed_rank = e.worker_rank
            else:
                failed_rank = None

            # Extract the stack trace from the exception
            e = skip_exceptions(e)
            stack_trace = "".join(
                traceback.format_exception(type(e), e, e.__traceback__)
            )

            self._backend_executor.report_final_run_status(
                errored=True, stack_trace=stack_trace, failed_rank=failed_rank
            )
            try:
                # Exception raised in at least one training worker. Immediately raise
                # this error to the user and do not attempt to terminate gracefully.
                self._backend_executor.shutdown(graceful_termination=False)
                self._finished_training = True
            except Exception:
                pass
            raise

    def _fetch_next_result(self) -> Optional[List[Dict]]:
        """Fetch next results produced by ``session.report()`` from each worker.

        Assumes ``start_training`` has already been called.

        Returns:
            A list of dictionaries of values passed to ``session.report()`` from
                each worker. Each item corresponds to an intermediate result from
                a single worker. If there are no more items to fetch,
                returns None.
        """
        results = self._backend_executor.get_next_results()
        if results is None:
            return None
        assert all(isinstance(result, _TrainingResult) for result in results)
        return results

    def _finish_training(self):
        """Finish training and return final results. Propagate any exceptions.

        Blocks until training is finished on all workers.

        Assumes `start_training` has already been called.

        Returns:
            A list of return values from calling ``train_func`` on each worker.
                Each item corresponds to the return value from a single worker.
        """
        return self._backend_executor.finish_training()

    def is_finished(self) -> bool:
        return self._finished_training
.venv/lib/python3.11/site-packages/ray/train/utils.py
ADDED
@@ -0,0 +1,19 @@
import warnings

from ray.util.annotations import RayDeprecationWarning


def _copy_doc(copy_func):
    def wrapped(func):
        func.__doc__ = copy_func.__doc__
        return func

    return wrapped


def _log_deprecation_warning(message):
    warnings.warn(
        message,
        RayDeprecationWarning,
        stacklevel=2,
    )
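A small sketch of how the `_copy_doc` decorator above behaves; the `_internal_report`/`public_report` names are made up for the demonstration.

from ray.train.utils import _copy_doc


def _internal_report(metrics: dict) -> None:
    """Report a dictionary of metrics for the current iteration."""


@_copy_doc(_internal_report)
def public_report(metrics: dict) -> None:
    return _internal_report(metrics)


# The wrapper now carries the original docstring.
assert public_report.__doc__ == _internal_report.__doc__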
.venv/lib/python3.11/site-packages/ray/train/xgboost/__init__.py
ADDED
@@ -0,0 +1,20 @@
from ray.train.v2._internal.constants import is_v2_enabled
from ray.train.xgboost._xgboost_utils import RayTrainReportCallback
from ray.train.xgboost.config import XGBoostConfig
from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint
from ray.train.xgboost.xgboost_predictor import XGBoostPredictor
from ray.train.xgboost.xgboost_trainer import XGBoostTrainer

if is_v2_enabled():
    from ray.train.v2.xgboost.xgboost_trainer import XGBoostTrainer  # noqa: F811

__all__ = [
    "RayTrainReportCallback",
    "XGBoostCheckpoint",
    "XGBoostConfig",
    "XGBoostPredictor",
    "XGBoostTrainer",
]


# DO NOT ADD ANYTHING AFTER THIS LINE.
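Because the package `__init__` re-exports these names (per its `__all__`), user code can import the whole public surface from the top-level module; a minimal sketch:

from ray.train.xgboost import (
    RayTrainReportCallback,
    XGBoostCheckpoint,
    XGBoostConfig,
    XGBoostPredictor,
    XGBoostTrainer,
)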
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (883 Bytes).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/_xgboost_utils.cpython-311.pyc
ADDED
Binary file (10.5 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/config.cpython-311.pyc
ADDED
Binary file (10.7 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/v2.cpython-311.pyc
ADDED
Binary file (6.67 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_checkpoint.cpython-311.pyc
ADDED
Binary file (4.11 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_predictor.cpython-311.pyc
ADDED
Binary file (7.94 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_trainer.cpython-311.pyc
ADDED
Binary file (11.1 kB).
.venv/lib/python3.11/site-packages/ray/train/xgboost/_xgboost_utils.py
ADDED
@@ -0,0 +1,210 @@
import tempfile
from collections import OrderedDict
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

from xgboost.core import Booster

import ray.train
from ray.train import Checkpoint
from ray.tune.utils import flatten_dict
from ray.util.annotations import PublicAPI

try:
    from xgboost.callback import TrainingCallback
except ImportError:

    class TrainingCallback:
        pass


class TuneCallback(TrainingCallback):
    # TODO(justinvyu): [code_removal] Remove this after enforcing min xgboost version.
    """Base class for Tune's XGBoost callbacks."""

    def __call__(self, env):
        """Compatibility with xgboost<1.3"""
        return self.after_iteration(
            env.model, env.iteration, env.evaluation_result_list
        )

    def after_iteration(self, model: Booster, epoch: int, evals_log: Dict):
        raise NotImplementedError


@PublicAPI(stability="beta")
class RayTrainReportCallback(TuneCallback):
    """XGBoost callback to save checkpoints and report metrics.

    Args:
        metrics: Metrics to report. If this is a list,
            each item describes the metric key reported to XGBoost,
            and it will be reported under the same name.
            This can also be a dict of {<key-to-report>: <xgboost-metric-key>},
            which can be used to rename xgboost default metrics.
        filename: Customize the saved checkpoint file type by passing
            a filename. Defaults to "model.ubj".
        frequency: How often to save checkpoints, in terms of iterations.
            Defaults to 0 (no checkpoints are saved during training).
        checkpoint_at_end: Whether or not to save a checkpoint at the end of training.
        results_postprocessing_fn: An optional Callable that takes in
            the metrics dict that will be reported (after it has been flattened)
            and returns a modified dict. For example, this can be used to
            average results across CV folds when using ``xgboost.cv``.

    Examples
    --------

    Reporting checkpoints and metrics to Ray Tune when running many
    independent xgboost trials (without data parallelism within a trial).

    .. testcode::
        :skipif: True

        import xgboost

        from ray.tune import Tuner
        from ray.train.xgboost import RayTrainReportCallback

        def train_fn(config):
            # Report log loss to Ray Tune after each validation epoch.
            bst = xgboost.train(
                ...,
                callbacks=[
                    RayTrainReportCallback(
                        metrics={"loss": "eval-logloss"}, frequency=1
                    )
                ],
            )

        tuner = Tuner(train_fn)
        results = tuner.fit()

    Loading a model from a checkpoint reported by this callback.

    .. testcode::
        :skipif: True

        from ray.train.xgboost import RayTrainReportCallback

        # Get a `Checkpoint` object that is saved by the callback during training.
        result = trainer.fit()
        booster = RayTrainReportCallback.get_model(result.checkpoint)

    """

    CHECKPOINT_NAME = "model.ubj"

    def __init__(
        self,
        metrics: Optional[Union[str, List[str], Dict[str, str]]] = None,
        filename: str = CHECKPOINT_NAME,
        frequency: int = 0,
        checkpoint_at_end: bool = True,
        results_postprocessing_fn: Optional[
            Callable[[Dict[str, Union[float, List[float]]]], Dict[str, float]]
        ] = None,
    ):
        if isinstance(metrics, str):
            metrics = [metrics]
        self._metrics = metrics
        self._filename = filename
        self._frequency = frequency
        self._checkpoint_at_end = checkpoint_at_end
        self._results_postprocessing_fn = results_postprocessing_fn

        # Keeps track of the eval metrics from the last iteration,
        # so that the latest metrics can be reported with the checkpoint
        # at the end of training.
        self._evals_log = None
        # Keep track of the last checkpoint iteration to avoid double-checkpointing
        # when using `checkpoint_at_end=True`.
        self._last_checkpoint_iteration = None

    @classmethod
    def get_model(
        cls, checkpoint: Checkpoint, filename: str = CHECKPOINT_NAME
    ) -> Booster:
        """Retrieve the model stored in a checkpoint reported by this callback.

        Args:
            checkpoint: The checkpoint object returned by a training run.
                The checkpoint should be saved by an instance of this callback.
            filename: The filename to load the model from, which should match
                the filename used when creating the callback.
        """
        with checkpoint.as_directory() as checkpoint_path:
            booster = Booster()
            booster.load_model(Path(checkpoint_path, filename).as_posix())
            return booster

    def _get_report_dict(self, evals_log):
        if isinstance(evals_log, OrderedDict):
            # xgboost>=1.3
            result_dict = flatten_dict(evals_log, delimiter="-")
            for k in list(result_dict):
                result_dict[k] = result_dict[k][-1]
        else:
            # xgboost<1.3
            result_dict = dict(evals_log)
        if not self._metrics:
            report_dict = result_dict
        else:
            report_dict = {}
            for key in self._metrics:
                if isinstance(self._metrics, dict):
                    metric = self._metrics[key]
                else:
                    metric = key
                report_dict[key] = result_dict[metric]

        if self._results_postprocessing_fn:
            report_dict = self._results_postprocessing_fn(report_dict)

        return report_dict

    @contextmanager
    def _get_checkpoint(self, model: Booster) -> Optional[Checkpoint]:
        # NOTE: The world rank returns None for Tune usage without Train.
        if ray.train.get_context().get_world_rank() in (0, None):
            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
                model.save_model(Path(temp_checkpoint_dir, self._filename).as_posix())
                yield Checkpoint(temp_checkpoint_dir)
        else:
            yield None

    def after_iteration(self, model: Booster, epoch: int, evals_log: Dict):
        self._evals_log = evals_log

        checkpointing_disabled = self._frequency == 0
        # Ex: if frequency=2, checkpoint at epoch 1, 3, 5, ... (counting from 0)
        should_checkpoint = (
            not checkpointing_disabled and (epoch + 1) % self._frequency == 0
        )

        report_dict = self._get_report_dict(evals_log)
        if should_checkpoint:
            self._last_checkpoint_iteration = epoch
            with self._get_checkpoint(model=model) as checkpoint:
                ray.train.report(report_dict, checkpoint=checkpoint)
        else:
            ray.train.report(report_dict)

    def after_training(self, model: Booster) -> Booster:
        if not self._checkpoint_at_end:
            return model

        if (
            self._last_checkpoint_iteration is not None
            and model.num_boosted_rounds() - 1 == self._last_checkpoint_iteration
        ):
            # Avoids a duplicate checkpoint if the checkpoint frequency happens
            # to align with the last iteration.
            return model

        report_dict = self._get_report_dict(self._evals_log) if self._evals_log else {}
        with self._get_checkpoint(model=model) as checkpoint:
            ray.train.report(report_dict, checkpoint=checkpoint)

        return model
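As the docstrings above note, the `filename` passed to the callback and to `get_model` must match. A brief hedged sketch of that pairing; the `result` object is assumed to come from a finished `trainer.fit()` call and is not defined here.

from ray.train.xgboost import RayTrainReportCallback

# During training: save the booster under a custom filename every 10 iterations.
callback = RayTrainReportCallback(filename="my_model.ubj", frequency=10)

# After training: load it back with the same filename.
# `result` is assumed to be the Result returned by trainer.fit().
booster = RayTrainReportCallback.get_model(result.checkpoint, filename="my_model.ubj")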
.venv/lib/python3.11/site-packages/ray/train/xgboost/config.py
ADDED
@@ -0,0 +1,202 @@
import json
import logging
import os
import threading
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional

import xgboost
from packaging.version import Version
from xgboost import RabitTracker
from xgboost.collective import CommunicatorContext

import ray
from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import Backend, BackendConfig

logger = logging.getLogger(__name__)


@dataclass
class XGBoostConfig(BackendConfig):
    """Configuration for xgboost collective communication setup.

    Ray Train will set up the necessary coordinator processes and environment
    variables for your workers to communicate with each other.
    Additional configuration options can be passed into the
    `xgboost.collective.CommunicatorContext` that wraps your own `xgboost.train` code.

    See the `xgboost.collective` module for more information:
    https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/collective.py

    Args:
        xgboost_communicator: The backend to use for collective communication for
            distributed xgboost training. For now, only "rabit" is supported.
    """

    xgboost_communicator: str = "rabit"

    @property
    def train_func_context(self):
        @contextmanager
        def collective_communication_context():
            with CommunicatorContext(**_get_xgboost_args()):
                yield

        return collective_communication_context

    @property
    def backend_cls(self):
        if self.xgboost_communicator == "rabit":
            return (
                _XGBoostRabitBackend
                if Version(xgboost.__version__) >= Version("2.1.0")
                else _XGBoostRabitBackend_pre_xgb210
            )

        raise NotImplementedError(f"Unsupported backend: {self.xgboost_communicator}")


class _XGBoostRabitBackend(Backend):
    def __init__(self):
        self._tracker: Optional[RabitTracker] = None
        self._wait_thread: Optional[threading.Thread] = None

    def _setup_xgboost_distributed_backend(self, worker_group: WorkerGroup):
        # Set up the rabit tracker on the Train driver.
        num_workers = len(worker_group)
        rabit_args = {"n_workers": num_workers}
        train_driver_ip = ray.util.get_node_ip_address()

        # NOTE: sortby="task" is needed to ensure that the xgboost worker ranks
        # align with Ray Train worker ranks.
        # The worker ranks will be sorted by `dmlc_task_id`,
        # which is defined below.
        self._tracker = RabitTracker(
            n_workers=num_workers, host_ip=train_driver_ip, sortby="task"
        )
        self._tracker.start()

        # The RabitTracker is started in a separate thread, and the
        # `wait_for` method must be called for `worker_args` to return.
        self._wait_thread = threading.Thread(target=self._tracker.wait_for, daemon=True)
        self._wait_thread.start()

        rabit_args.update(self._tracker.worker_args())

        start_log = (
            "RabitTracker coordinator started with parameters:\n"
            f"{json.dumps(rabit_args, indent=2)}"
        )
        logger.debug(start_log)

        def set_xgboost_communicator_args(args):
            import ray.train

            args["dmlc_task_id"] = (
                f"[xgboost.ray-rank={ray.train.get_context().get_world_rank():08}]:"
                f"{ray.get_runtime_context().get_actor_id()}"
            )

            _set_xgboost_args(args)

        worker_group.execute(set_xgboost_communicator_args, rabit_args)

    def on_training_start(
        self, worker_group: WorkerGroup, backend_config: XGBoostConfig
    ):
        assert backend_config.xgboost_communicator == "rabit"
        self._setup_xgboost_distributed_backend(worker_group)

    def on_shutdown(self, worker_group: WorkerGroup, backend_config: XGBoostConfig):
        timeout = 5

        if self._wait_thread is not None:
            self._wait_thread.join(timeout=timeout)

            if self._wait_thread.is_alive():
                logger.warning(
                    "During shutdown, the RabitTracker thread failed to join "
                    f"within {timeout} seconds. "
                    "The process will still be terminated as part of Ray actor cleanup."
                )


class _XGBoostRabitBackend_pre_xgb210(Backend):
    def __init__(self):
        self._tracker: Optional[RabitTracker] = None

    def _setup_xgboost_distributed_backend(self, worker_group: WorkerGroup):
        # Set up the rabit tracker on the Train driver.
        num_workers = len(worker_group)
        rabit_args = {"DMLC_NUM_WORKER": num_workers}
        train_driver_ip = ray.util.get_node_ip_address()

        # NOTE: sortby="task" is needed to ensure that the xgboost worker ranks
        # align with Ray Train worker ranks.
        # The worker ranks will be sorted by `DMLC_TASK_ID`,
        # which is defined below.
        self._tracker = RabitTracker(
            n_workers=num_workers, host_ip=train_driver_ip, sortby="task"
        )
        self._tracker.start(n_workers=num_workers)

        worker_args = self._tracker.worker_envs()
        rabit_args.update(worker_args)

        start_log = (
            "RabitTracker coordinator started with parameters:\n"
            f"{json.dumps(rabit_args, indent=2)}"
        )
        logger.debug(start_log)

        def set_xgboost_env_vars():
            import ray.train

            for k, v in rabit_args.items():
                os.environ[k] = str(v)

            # Ranks are assigned in increasing order of the worker's task id.
            # This task id will be sorted by increasing world rank.
            os.environ["DMLC_TASK_ID"] = (
                f"[xgboost.ray-rank={ray.train.get_context().get_world_rank():08}]:"
                f"{ray.get_runtime_context().get_actor_id()}"
            )

        worker_group.execute(set_xgboost_env_vars)

    def on_training_start(
        self, worker_group: WorkerGroup, backend_config: XGBoostConfig
    ):
        assert backend_config.xgboost_communicator == "rabit"
        self._setup_xgboost_distributed_backend(worker_group)

    def on_shutdown(self, worker_group: WorkerGroup, backend_config: XGBoostConfig):
        if not self._tracker:
            return

        timeout = 5
        self._tracker.thread.join(timeout=timeout)

        if self._tracker.thread.is_alive():
            logger.warning(
                "During shutdown, the RabitTracker thread failed to join "
                f"within {timeout} seconds. "
                "The process will still be terminated as part of Ray actor cleanup."
            )


_xgboost_args: dict = {}
_xgboost_args_lock = threading.Lock()


def _set_xgboost_args(args):
    with _xgboost_args_lock:
        global _xgboost_args
        _xgboost_args = args


def _get_xgboost_args() -> dict:
    with _xgboost_args_lock:
        return _xgboost_args
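Since `XGBoostConfig` currently supports only the "rabit" communicator, passing it explicitly mainly makes the backend choice visible in user code. A minimal sketch of wiring it into the trainer defined in `v2.py` below; the worker function body is elided and illustrative only.

import ray.train
from ray.train.xgboost import XGBoostConfig
from ray.train.xgboost.v2 import XGBoostTrainer


def train_fn_per_worker(config: dict):
    # Inside this function, calls into xgboost run under the CommunicatorContext
    # provided by XGBoostConfig.train_func_context.
    ...


trainer = XGBoostTrainer(
    train_fn_per_worker,
    xgboost_config=XGBoostConfig(xgboost_communicator="rabit"),
    scaling_config=ray.train.ScalingConfig(num_workers=2),
)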
.venv/lib/python3.11/site-packages/ray/train/xgboost/v2.py
ADDED
@@ -0,0 +1,133 @@
import logging
from typing import Any, Callable, Dict, Optional, Union

import ray.train
from ray.train import Checkpoint
from ray.train.data_parallel_trainer import DataParallelTrainer
from ray.train.trainer import GenDataset
from ray.train.xgboost import XGBoostConfig

logger = logging.getLogger(__name__)


class XGBoostTrainer(DataParallelTrainer):
    """A Trainer for distributed data-parallel XGBoost training.

    Example
    -------

    .. testcode::

        import xgboost

        import ray.data
        import ray.train
        from ray.train.xgboost import RayTrainReportCallback
        from ray.train.xgboost.v2 import XGBoostTrainer

        def train_fn_per_worker(config: dict):
            # (Optional) Add logic to resume training state from a checkpoint.
            # ray.train.get_checkpoint()

            # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix`
            train_ds_iter, eval_ds_iter = (
                ray.train.get_dataset_shard("train"),
                ray.train.get_dataset_shard("validation"),
            )
            train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()

            train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
            train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
            eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]

            dtrain = xgboost.DMatrix(train_X, label=train_y)
            deval = xgboost.DMatrix(eval_X, label=eval_y)

            params = {
                "tree_method": "approx",
                "objective": "reg:squarederror",
                "eta": 1e-4,
                "subsample": 0.5,
                "max_depth": 2,
            }

            # 2. Do distributed data-parallel training.
            # Ray Train sets up the necessary coordinator processes and
            # environment variables for your workers to communicate with each other.
            bst = xgboost.train(
                params,
                dtrain=dtrain,
                evals=[(deval, "validation")],
                num_boost_round=10,
                callbacks=[RayTrainReportCallback()],
            )

        train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
        eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)])
        trainer = XGBoostTrainer(
            train_fn_per_worker,
            datasets={"train": train_ds, "validation": eval_ds},
            scaling_config=ray.train.ScalingConfig(num_workers=4),
        )
        result = trainer.fit()
        booster = RayTrainReportCallback.get_model(result.checkpoint)

    .. testoutput::
        :hide:

        ...

    Args:
        train_loop_per_worker: The training function to execute on each worker.
            This function can either take in zero arguments or a single ``Dict``
            argument which is set by defining ``train_loop_config``.
            Within this function you can use any of the
            :ref:`Ray Train Loop utilities <train-loop-api>`.
        train_loop_config: A configuration ``Dict`` to pass in as an argument to
            ``train_loop_per_worker``.
            This is typically used for specifying hyperparameters.
        xgboost_config: The configuration for setting up the distributed xgboost
            backend. Defaults to using the "rabit" backend.
            See :class:`~ray.train.xgboost.XGBoostConfig` for more info.
        datasets: The Ray Datasets to use for training and validation.
        dataset_config: The configuration for ingesting the input ``datasets``.
            By default, all the Ray Datasets are split equally across workers.
            See :class:`~ray.train.DataConfig` for more details.
        scaling_config: The configuration for how to scale data parallel training.
            ``num_workers`` determines how many Python processes are used for training,
            and ``use_gpu`` determines whether or not each process should use GPUs.
            See :class:`~ray.train.ScalingConfig` for more info.
        run_config: The configuration for the execution of the training run.
            See :class:`~ray.train.RunConfig` for more info.
        resume_from_checkpoint: A checkpoint to resume training from.
            This checkpoint can be accessed from within ``train_loop_per_worker``
            by calling ``ray.train.get_checkpoint()``.
        metadata: Dict that should be made available via
            `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
            for checkpoints saved from this Trainer. Must be JSON-serializable.
    """

    def __init__(
        self,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        *,
        train_loop_config: Optional[Dict] = None,
        xgboost_config: Optional[XGBoostConfig] = None,
        scaling_config: Optional[ray.train.ScalingConfig] = None,
        run_config: Optional[ray.train.RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        dataset_config: Optional[ray.train.DataConfig] = None,
        metadata: Optional[Dict[str, Any]] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        super(XGBoostTrainer, self).__init__(
            train_loop_per_worker=train_loop_per_worker,
            train_loop_config=train_loop_config,
            backend_config=xgboost_config or XGBoostConfig(),
            scaling_config=scaling_config,
            dataset_config=dataset_config,
            run_config=run_config,
            datasets=datasets,
            resume_from_checkpoint=resume_from_checkpoint,
            metadata=metadata,
        )
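The ``train_loop_config`` argument documented above is forwarded to the worker function as its single dict argument. A minimal sketch of threading a hyperparameter through it; the key name and value are illustrative only.

import ray.train
from ray.train.xgboost.v2 import XGBoostTrainer


def train_fn_per_worker(config: dict):
    # `config` is the `train_loop_config` dict passed to the trainer.
    num_boost_round = config["num_boost_round"]
    # ... build DMatrix objects and call xgboost.train(..., num_boost_round=num_boost_round) ...
    ...


trainer = XGBoostTrainer(
    train_fn_per_worker,
    train_loop_config={"num_boost_round": 50},
    scaling_config=ray.train.ScalingConfig(num_workers=2),
)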
.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_checkpoint.py
ADDED
@@ -0,0 +1,75 @@
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Optional

import xgboost

from ray.train._internal.framework_checkpoint import FrameworkCheckpoint
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
    from ray.data.preprocessor import Preprocessor


@PublicAPI(stability="beta")
class XGBoostCheckpoint(FrameworkCheckpoint):
    """A :py:class:`~ray.train.Checkpoint` with XGBoost-specific functionality."""

    MODEL_FILENAME = "model.json"

    @classmethod
    def from_model(
        cls,
        booster: xgboost.Booster,
        *,
        preprocessor: Optional["Preprocessor"] = None,
        path: Optional[str] = None,
    ) -> "XGBoostCheckpoint":
        """Create a :py:class:`~ray.train.Checkpoint` that stores an XGBoost
        model.

        Args:
            booster: The XGBoost model to store in the checkpoint.
            preprocessor: A fitted preprocessor to be applied before inference.
            path: The path to the directory where the checkpoint file will be saved.
                This should start as an empty directory, since the *entire*
                directory will be treated as the checkpoint when reported.
                By default, a temporary directory will be created.

        Returns:
            An :py:class:`XGBoostCheckpoint` containing the specified ``Estimator``.

        Examples:

            .. testcode::

                import numpy as np
                import ray
                from ray.train.xgboost import XGBoostCheckpoint
                import xgboost

                train_X = np.array([[1, 2], [3, 4]])
                train_y = np.array([0, 1])

                model = xgboost.XGBClassifier().fit(train_X, train_y)
                checkpoint = XGBoostCheckpoint.from_model(model.get_booster())

        """
        checkpoint_path = Path(path or tempfile.mkdtemp())

        if not checkpoint_path.is_dir():
            raise ValueError(f"`path` must be a directory, but got: {checkpoint_path}")

        booster.save_model(checkpoint_path.joinpath(cls.MODEL_FILENAME).as_posix())

        checkpoint = cls.from_directory(checkpoint_path.as_posix())
        if preprocessor:
            checkpoint.set_preprocessor(preprocessor)
        return checkpoint

    def get_model(self) -> xgboost.Booster:
        """Retrieve the XGBoost model stored in this checkpoint."""
        with self.as_directory() as checkpoint_path:
            booster = xgboost.Booster()
            booster.load_model(Path(checkpoint_path, self.MODEL_FILENAME).as_posix())
            return booster
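A short sketch of the round trip supported by `XGBoostCheckpoint` above: save a booster with `from_model`, then recover it later with `get_model`. It assumes only what this class provides, plus `numpy` and `xgboost`; the tiny training data is illustrative.

import numpy as np
import xgboost

from ray.train.xgboost import XGBoostCheckpoint

train_X = np.array([[1, 2], [3, 4]])
train_y = np.array([0, 1])
booster = xgboost.XGBClassifier().fit(train_X, train_y).get_booster()

# Write the model into a (temporary) checkpoint directory...
checkpoint = XGBoostCheckpoint.from_model(booster)

# ...and load it back elsewhere, e.g. for batch inference.
restored = checkpoint.get_model()
print(restored.num_boosted_rounds())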
.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_predictor.py
ADDED
@@ -0,0 +1,160 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import pandas as pd
import xgboost

from ray.air.constants import TENSOR_COLUMN_NAME
from ray.air.data_batch_type import DataBatchType
from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
from ray.train.predictor import Predictor
from ray.train.xgboost import XGBoostCheckpoint
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
    from ray.data.preprocessor import Preprocessor


@PublicAPI(stability="beta")
class XGBoostPredictor(Predictor):
    """A predictor for XGBoost models.

    Args:
        model: The XGBoost booster to use for predictions.
        preprocessor: A preprocessor used to transform data batches prior
            to prediction.
    """

    def __init__(
        self, model: xgboost.Booster, preprocessor: Optional["Preprocessor"] = None
    ):
        self.model = model
        super().__init__(preprocessor)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(model={self.model!r}, "
            f"preprocessor={self._preprocessor!r})"
        )

    @classmethod
    def from_checkpoint(cls, checkpoint: XGBoostCheckpoint) -> "XGBoostPredictor":
        """Instantiate the predictor from a Checkpoint.

        This is a helper constructor that instantiates the predictor from a
        framework-specific XGBoost checkpoint.

        Args:
            checkpoint: The checkpoint to load the model and preprocessor from.
        """
        model = checkpoint.get_model()
        preprocessor = checkpoint.get_preprocessor()
        return cls(model=model, preprocessor=preprocessor)

    def predict(
        self,
        data: DataBatchType,
        feature_columns: Optional[Union[List[str], List[int]]] = None,
        dmatrix_kwargs: Optional[Dict[str, Any]] = None,
        **predict_kwargs,
    ) -> DataBatchType:
        """Run inference on a data batch.

        The data is converted into an XGBoost DMatrix before being passed to
        the model.

        Args:
            data: A batch of input data.
            feature_columns: The names or indices of the columns in the
                data to use as features to predict on. If None, then use
                all columns in ``data``.
            dmatrix_kwargs: Dict of keyword arguments passed to ``xgboost.DMatrix``.
            **predict_kwargs: Keyword arguments passed to ``xgboost.Booster.predict``.

        Examples:

            .. testcode::

                import numpy as np
                import xgboost as xgb
                from ray.train.xgboost import XGBoostPredictor

                train_X = np.array([[1, 2], [3, 4]])
                train_y = np.array([0, 1])

                model = xgb.XGBClassifier().fit(train_X, train_y)
                predictor = XGBoostPredictor(model=model.get_booster())

                data = np.array([[1, 2], [3, 4]])
                predictions = predictor.predict(data)

                # Only use first and second column as the feature
                data = np.array([[1, 2, 8], [3, 4, 9]])
                predictions = predictor.predict(data, feature_columns=[0, 1])

            .. testcode::

                import pandas as pd
                import xgboost as xgb
                from ray.train.xgboost import XGBoostPredictor

                train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
                train_y = pd.Series([0, 1])

                model = xgb.XGBClassifier().fit(train_X, train_y)
                predictor = XGBoostPredictor(model=model.get_booster())

                # Pandas dataframe.
                data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
                predictions = predictor.predict(data)

                # Only use first and second column as the feature
                data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
                predictions = predictor.predict(data, feature_columns=["A", "B"])

        Returns:
            Prediction result.
        """
        return Predictor.predict(
            self,
            data,
            feature_columns=feature_columns,
            dmatrix_kwargs=dmatrix_kwargs,
            **predict_kwargs,
        )

    def _predict_pandas(
        self,
        data: "pd.DataFrame",
        feature_columns: Optional[Union[List[str], List[int]]] = None,
        dmatrix_kwargs: Optional[Dict[str, Any]] = None,
        **predict_kwargs,
    ) -> "pd.DataFrame":
        dmatrix_kwargs = dmatrix_kwargs or {}

        feature_names = None
        if TENSOR_COLUMN_NAME in data:
            data = data[TENSOR_COLUMN_NAME].to_numpy()
            data = _unwrap_ndarray_object_type_if_needed(data)
            if feature_columns:
                # In this case feature_columns is a list of integers
                data = data[:, feature_columns]
        elif feature_columns:
            # feature_columns is a list of integers or strings
            data = data[feature_columns].to_numpy()
            # Only set the feature names if they are strings
            if all(isinstance(fc, str) for fc in feature_columns):
                feature_names = feature_columns
        else:
            feature_columns = data.columns.tolist()
            data = data.to_numpy()

            if all(isinstance(fc, str) for fc in feature_columns):
                feature_names = feature_columns

        if feature_names:
            dmatrix_kwargs["feature_names"] = feature_names

        matrix = xgboost.DMatrix(data, **dmatrix_kwargs)
        df = pd.DataFrame(self.model.predict(matrix, **predict_kwargs))
        df.columns = (
            ["predictions"]
            if len(df.columns) == 1
            else [f"predictions_{i}" for i in range(len(df.columns))]
        )
        return df
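Tying the two classes together: ``XGBoostPredictor.from_checkpoint`` pulls both the booster and any stored preprocessor out of an ``XGBoostCheckpoint``, and ``predict`` forwards ``dmatrix_kwargs`` to ``xgboost.DMatrix`` and any remaining keyword arguments to ``Booster.predict``. A minimal sketch under the same assumptions as above (local ``ray``/``xgboost`` install, illustrative toy data):

    import numpy as np
    import xgboost

    from ray.train.xgboost import XGBoostCheckpoint, XGBoostPredictor

    train_X = np.array([[1, 2], [3, 4]])
    train_y = np.array([0, 1])
    booster = xgboost.XGBClassifier().fit(train_X, train_y).get_booster()
    checkpoint = XGBoostCheckpoint.from_model(booster)

    predictor = XGBoostPredictor.from_checkpoint(checkpoint)
    # "predictions" holds one prediction per input row.
    predictions = predictor.predict(np.array([[1, 2], [3, 4]]))
    print(predictions)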
.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_trainer.py
ADDED
@@ -0,0 +1,222 @@
import logging
from functools import partial
from typing import Any, Dict, Optional

import xgboost
from packaging.version import Version

import ray.train
from ray.train import Checkpoint
from ray.train.constants import _DEPRECATED_VALUE, TRAIN_DATASET_KEY
from ray.train.trainer import GenDataset
from ray.train.xgboost import RayTrainReportCallback
from ray.train.xgboost.v2 import XGBoostTrainer as SimpleXGBoostTrainer
from ray.util.annotations import PublicAPI

logger = logging.getLogger(__name__)


def _xgboost_train_fn_per_worker(
    config: dict,
    label_column: str,
    num_boost_round: int,
    dataset_keys: set,
    xgboost_train_kwargs: dict,
):
    checkpoint = ray.train.get_checkpoint()
    starting_model = None
    remaining_iters = num_boost_round
    if checkpoint:
        starting_model = RayTrainReportCallback.get_model(checkpoint)
        starting_iter = starting_model.num_boosted_rounds()
        remaining_iters = num_boost_round - starting_iter
        logger.info(
            f"Model loaded from checkpoint will train for "
            f"additional {remaining_iters} iterations (trees) in order "
            "to achieve the target number of iterations "
            f"({num_boost_round=})."
        )

    train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
    train_df = train_ds_iter.materialize().to_pandas()

    eval_ds_iters = {
        k: ray.train.get_dataset_shard(k)
        for k in dataset_keys
        if k != TRAIN_DATASET_KEY
    }
    eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()}

    train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column]
    dtrain = xgboost.DMatrix(train_X, label=train_y)

    # NOTE: Include the training dataset in the evaluation datasets.
    # This allows `train-*` metrics to be calculated and reported.
    evals = [(dtrain, TRAIN_DATASET_KEY)]

    for eval_name, eval_df in eval_dfs.items():
        eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column]
        evals.append((xgboost.DMatrix(eval_X, label=eval_y), eval_name))

    evals_result = {}
    xgboost.train(
        config,
        dtrain=dtrain,
        evals=evals,
        evals_result=evals_result,
        num_boost_round=remaining_iters,
        xgb_model=starting_model,
        **xgboost_train_kwargs,
    )


@PublicAPI(stability="beta")
class XGBoostTrainer(SimpleXGBoostTrainer):
    """A Trainer for data parallel XGBoost training.

    This Trainer runs the XGBoost training loop in a distributed manner
    using multiple Ray Actors.

    .. note::
        ``XGBoostTrainer`` does not modify or otherwise alter the working
        of the XGBoost distributed training algorithm.
        Ray only provides orchestration, data ingestion, and fault tolerance.
        For more information on XGBoost distributed training, refer to
        the `XGBoost documentation <https://xgboost.readthedocs.io>`__.

    Example:
        .. testcode::

            import ray

            from ray.train.xgboost import XGBoostTrainer
            from ray.train import ScalingConfig

            train_dataset = ray.data.from_items(
                [{"x": x, "y": x + 1} for x in range(32)])
            trainer = XGBoostTrainer(
                label_column="y",
                params={"objective": "reg:squarederror"},
                scaling_config=ScalingConfig(num_workers=3),
                datasets={"train": train_dataset},
            )
            result = trainer.fit()

        .. testoutput::
            :hide:

            ...

    Args:
        datasets: The Ray Datasets to use for training and validation. Must include a
            "train" key denoting the training dataset. All non-training datasets will
            be used as separate validation sets, each reporting a separate metric.
        label_column: Name of the label column. A column with this name
            must be present in the training dataset.
        params: XGBoost training parameters.
            Refer to `XGBoost documentation <https://xgboost.readthedocs.io/>`_
            for a list of possible parameters.
        num_boost_round: Target number of boosting iterations (trees in the model).
            Note that unlike in ``xgboost.train``, this is the target number
            of trees, meaning that if you set ``num_boost_round=10`` and pass a model
            that has already been trained for 5 iterations, it will be trained for 5
            iterations more, instead of 10 more.
        scaling_config: Configuration for how to scale data parallel training.
        run_config: Configuration for the execution of the training run.
        dataset_config: The configuration for ingesting the input ``datasets``.
            By default, all the Ray Datasets are split equally across workers.
            See :class:`~ray.train.DataConfig` for more details.
        resume_from_checkpoint: A checkpoint to resume training from.
        metadata: Dict that should be made available in `checkpoint.get_metadata()`
            for checkpoints saved from this Trainer. Must be JSON-serializable.
        **train_kwargs: Additional kwargs passed to the ``xgboost.train()`` function.
    """

    _handles_checkpoint_freq = True
    _handles_checkpoint_at_end = True

    def __init__(
        self,
        *,
        datasets: Dict[str, GenDataset],
        label_column: str,
        params: Dict[str, Any],
        dmatrix_params: Optional[Dict[str, Dict[str, Any]]] = _DEPRECATED_VALUE,
        num_boost_round: int = 10,
        scaling_config: Optional[ray.train.ScalingConfig] = None,
        run_config: Optional[ray.train.RunConfig] = None,
        dataset_config: Optional[ray.train.DataConfig] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **train_kwargs,
    ):
        if Version(xgboost.__version__) < Version("1.7.0"):
            raise ImportError(
                "`XGBoostTrainer` requires the `xgboost` version to be >= 1.7.0. "
                'Upgrade with: `pip install -U "xgboost>=1.7"`'
            )

        # TODO(justinvyu): [Deprecated] Remove in 2.11
        if dmatrix_params != _DEPRECATED_VALUE:
            raise DeprecationWarning(
                "`dmatrix_params` is deprecated, since XGBoostTrainer no longer "
                "depends on the `xgboost_ray.RayDMatrix` utility. "
                "You can remove this argument and use `dataset_config` instead "
                "to customize Ray Dataset ingestion."
            )

        # Initialize a default Ray Train metrics/checkpoint reporting callback if needed
        callbacks = train_kwargs.get("callbacks", [])
        user_supplied_callback = any(
            isinstance(callback, RayTrainReportCallback) for callback in callbacks
        )
        callback_kwargs = {}
        if run_config:
            checkpoint_frequency = run_config.checkpoint_config.checkpoint_frequency
            checkpoint_at_end = run_config.checkpoint_config.checkpoint_at_end

            callback_kwargs["frequency"] = checkpoint_frequency
            # Default `checkpoint_at_end=True` unless the user explicitly sets it.
            callback_kwargs["checkpoint_at_end"] = (
                checkpoint_at_end if checkpoint_at_end is not None else True
            )

        if not user_supplied_callback:
            callbacks.append(RayTrainReportCallback(**callback_kwargs))
        train_kwargs["callbacks"] = callbacks

        train_fn_per_worker = partial(
            _xgboost_train_fn_per_worker,
            label_column=label_column,
            num_boost_round=num_boost_round,
            dataset_keys=set(datasets),
            xgboost_train_kwargs=train_kwargs,
        )

        super(XGBoostTrainer, self).__init__(
            train_loop_per_worker=train_fn_per_worker,
            train_loop_config=params,
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            dataset_config=dataset_config,
            resume_from_checkpoint=resume_from_checkpoint,
            metadata=metadata,
        )

    @classmethod
    def get_model(
        cls,
        checkpoint: Checkpoint,
    ) -> xgboost.Booster:
        """Retrieve the XGBoost model stored in this checkpoint."""
        return RayTrainReportCallback.get_model(checkpoint)

    def _validate_attributes(self):
        super()._validate_attributes()

        if TRAIN_DATASET_KEY not in self.datasets:
            raise KeyError(
                f"'{TRAIN_DATASET_KEY}' key must be present in `datasets`. "
                f"Got {list(self.datasets.keys())}"
            )
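After ``trainer.fit()`` returns, the final checkpoint written by ``RayTrainReportCallback`` can be turned back into a plain ``xgboost.Booster`` with the ``XGBoostTrainer.get_model`` classmethod and used for local scoring. A minimal sketch, not part of the packaged file, assuming a local Ray runtime is available; the dataset, parameters, and worker count are illustrative and mirror the docstring example:

    import numpy as np
    import ray
    import xgboost

    from ray.train import ScalingConfig
    from ray.train.xgboost import XGBoostTrainer

    train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
    trainer = XGBoostTrainer(
        label_column="y",
        params={"objective": "reg:squarederror"},
        num_boost_round=5,
        scaling_config=ScalingConfig(num_workers=2),
        datasets={"train": train_dataset},
    )
    result = trainer.fit()

    # The final checkpoint was reported by RayTrainReportCallback during training.
    booster = XGBoostTrainer.get_model(result.checkpoint)
    preds = booster.predict(xgboost.DMatrix(np.array([[1.0]]), feature_names=["x"]))
    print(preds)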
.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (5.87 kB).