Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py +5 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py +830 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py +185 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py +139 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py +103 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py +45 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/session.py +1163 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py +14 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py +158 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py +62 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py +126 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/storage.py +725 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py +490 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/utils.py +239 -0
- .venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py +426 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py +22 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/config.py +159 -0
- .venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py +202 -0
- .venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py +39 -0
- .venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.69 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc
ADDED
Binary file (3.3 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc
ADDED
Binary file (37.6 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc
ADDED
Binary file (3.1 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc
ADDED
Binary file (7.62 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc
ADDED
Binary file (26.8 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc
ADDED
Binary file (671 Bytes)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc
ADDED
Binary file (12.6 kB)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc
ADDED
Binary file (181 Bytes)

.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (896 Bytes)

.venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py
ADDED
File without changes

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (192 Bytes)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc
ADDED
Binary file (551 Bytes)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc
ADDED
Binary file (36.6 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc
ADDED
Binary file (8.59 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc
ADDED
Binary file (6.97 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc
ADDED
Binary file (5.64 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc
ADDED
Binary file (2.52 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc
ADDED
Binary file (46.4 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc
ADDED
Binary file (36.2 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc
ADDED
Binary file (23.8 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (11.5 kB)

.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc
ADDED
Binary file (21.4 kB)
.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py
ADDED
@@ -0,0 +1,5 @@
import abc


class Accelerator(abc.ABC):
    """A utility that contains methods to accelerate training."""
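The Accelerator added above is only an abstract marker class. As a hedged illustration (the subclass name and method below are invented for this sketch and are not part of the committed files), a backend would derive from it and attach its own helpers:

import abc


class Accelerator(abc.ABC):
    """A utility that contains methods to accelerate training."""


# Hypothetical subclass, shown only to illustrate how the abstract base is
# meant to be extended; it does not exist in the diff above.
class NoOpAccelerator(Accelerator):
    def describe(self) -> str:
        # Purely illustrative helper method.
        return "no-op accelerator used for demonstration"


print(NoOpAccelerator().describe())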
.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py
ADDED
@@ -0,0 +1,830 @@
import logging
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar

import ray
import ray._private.ray_constants as ray_constants
from ray._private.ray_constants import env_integer
from ray.data import Dataset
from ray.exceptions import RayActorError
from ray.train import Checkpoint, DataConfig
from ray.train._internal.session import (
    TrialInfo,
    _TrainingResult,
    get_session,
    init_session,
    shutdown_session,
)
from ray.train._internal.storage import StorageContext
from ray.train._internal.utils import check_for_failure
from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import BackendConfig
from ray.train.constants import (
    ENABLE_DETAILED_AUTOFILLED_METRICS_ENV,
    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
    ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV,
    RAY_TRAIN_ENABLE_STATE_TRACKING,
    TRAIN_ENABLE_WORKER_SPREAD_ENV,
    TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
)
from ray.util.placement_group import get_current_placement_group, remove_placement_group

T = TypeVar("T")

logger = logging.getLogger(__name__)


class TrainBackendError(Exception):
    """Errors with BackendExecutor that should not be exposed to user."""


class TrainingWorkerError(Exception):
    """Raised if a worker fails during training."""


@dataclass
class ResourceConfig:
    """
    Resource configuration for resource_ids to share between workers.

    Args:
        resource_name: The name of the resource to configure
            (Example: "neuron_cores" or "gpu").
        resource_enable_sharing_env_var: The environment variable to
            check if the resource should be shared.
        share_resource_ids_env_var: The environment variable to configure for
            sharing the resources with other workers.
    """

    resource_name: str
    resource_enable_sharing_env_var: str
    share_resource_ids_env_var: str


class BackendExecutor:
    """Main execution class for training backends.

    This class holds a worker group and is responsible for executing the
    training function on the workers, and collecting intermediate results
    from ``session.report()``.

    Args:
        backend_config: The configurations for this
            specific backend.
        num_workers: Number of workers to use for training.
        resources_per_worker (Optional[Dict[str, float]]):
            Dictionary specifying the resources that will be
            requested for each worker. Defaults to {"CPU": 1}.
        max_retries: Number of retries when Ray actors fail.
            Defaults to 3. Set to -1 for unlimited retries.
    """

    def __init__(
        self,
        backend_config: BackendConfig,
        # TODO(xwjiang): Legacy Ray Train trainer clean up!
        trial_info: Optional[TrialInfo] = None,
        num_workers: int = 1,
        resources_per_worker: Optional[Dict[str, float]] = None,
        max_retries: int = 3,
    ):
        if resources_per_worker is None:
            self._resources_per_worker = {"CPU": 1}
        else:
            self._resources_per_worker = resources_per_worker.copy()

        self._backend_config = backend_config
        self._backend = backend_config.backend_cls()
        self._num_workers = num_workers
        self._max_failures = max_retries
        if self._max_failures < 0:
            self._max_failures = float("inf")
        self._num_failures = 0
        self._last_failure = None
        self._initialization_hook = None
        self._placement_group = None

        self._trial_info = trial_info

        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None

        self._resource_configs = [
            ResourceConfig(
                ray_constants.NEURON_CORES,
                ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
                ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR,
            ),
            ResourceConfig(
                ray_constants.NPU,
                ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
                ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR,
            ),
            # For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var.
            ResourceConfig(
                ray_constants.GPU,
                ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV,
                ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR,
            ),
        ]

        # Record the initialization time of BackendExecutor, which is
        # after trainer.fit() and before worker_group executes the training function.
        self._start_time_ms = int(time.time() * 1000)

        self.state_tracking_enabled = env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0)

    def start(
        self,
        initialization_hook: Optional[Callable[[], None]] = None,
        train_cls: Optional[Type] = None,
        train_cls_args: Optional[Tuple] = None,
        train_cls_kwargs: Optional[Dict] = None,
    ):
        """Starts the worker group."""
        self._create_placement_group()
        placement_group = self._placement_group or "default"
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            resources_per_worker=self._resources_per_worker,
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs,
            placement_group=placement_group,
        )
        # Hack to avoid OOMs.
        # This is just a temporary solution for Train loading entire checkpoints
        # into memory by ensuring that the rank 0 worker is on the same node as
        # trainable, thus allowing for lazy checkpoint transfer to be used.
        # See https://github.com/ray-project/ray/issues/33073
        # for more context.
        # TODO remove passing in trial_driver_ip.

        trial_driver_node_id = (
            self._trial_info.driver_node_id if self._trial_info else None
        )
        self.worker_group.sort_workers_by_node_id_and_gpu_id(trial_driver_node_id)

        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)

            # Always propagate the driver's DataContext to each worker in the group.
            from ray.data import DataContext

            def _set_driver_dataset_context(ctx: DataContext):
                DataContext._set_current(ctx)

            self.worker_group.execute(
                _set_driver_dataset_context,
                DataContext.get_current(),
            )

            share_cuda_visible_devices_enabled = bool(
                env_integer(
                    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                    self._backend.share_cuda_visible_devices,
                )
            )

            if (
                self._resources_per_worker.get("GPU", 0) > 0
                and share_cuda_visible_devices_enabled
            ):
                self._share_cuda_visible_devices()
            for resource_config in self._resource_configs:
                if self._is_share_resources_enabled(
                    resource_config.resource_name,
                    resource_config.resource_enable_sharing_env_var,
                ):
                    self._share_resource_ids(
                        resource_config.resource_name,
                        resource_config.share_resource_ids_env_var,
                    )
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            logger.warning(
                "Failure occurred during startup. Restarting all workers and "
                "attempting to startup again."
            )
            self._increment_failures()
            self._restart()

        if self.state_tracking_enabled:
            from ray.train._internal.state import TrainRunStateManager
            from ray.train._internal.state.state_actor import get_state_actor

            self.state_manager = TrainRunStateManager(state_actor=get_state_actor())

    def _create_placement_group(self):
        """Creates a placement group if it does not exist.

        If a placement group is already detected (Tune) this will be a no-op.

        By default the placement group will be created with PACK strategy.
        This is optimized for colocating GPUs on a minimal number of nodes.
        This behavior can be overridden to use the SPREAD strategy by defining
        ``TRAIN_ENABLE_WORKER_SPREAD_ENV``

        If a placement group is created it will be stored as
        self._placement_group.
        """
        current_placement_group = get_current_placement_group()
        worker = ray._private.worker.global_worker
        should_capture_child_tasks_in_placement_group = (
            worker.should_capture_child_tasks_in_placement_group
        )
        should_create_placement_group = (
            current_placement_group is None
            or not should_capture_child_tasks_in_placement_group
        )

        if should_create_placement_group:
            bundles = [
                self._resources_per_worker.copy() for _ in range(self._num_workers)
            ]

            use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0))
            strategy = "SPREAD" if use_spread else "PACK"

            placement_group = ray.util.placement_group(bundles, strategy=strategy)
            logger.debug("Waiting for placement group to start.")
            timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
            ready, _ = ray.wait([placement_group.ready()], timeout=timeout)
            if ready:
                logger.debug("Placement group has started.")
            else:
                raise TimeoutError(
                    "Placement group creation timed out. Make sure your "
                    "cluster either has enough resources or use an "
                    "autoscaling cluster. If you are running on a cluster, "
                    "make sure you specify an address in `ray.init()`, for example, "
                    '`ray.init("auto")`. You can also increase the timeout by setting '
                    "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. "
                    "Current resources available: {}, resources requested by the "
                    "placement group: {}".format(
                        ray.available_resources(), placement_group.bundle_specs
                    )
                )
            self._placement_group = placement_group

    def _share_cuda_visible_devices(self):
        """Sets CUDA_VISIBLE_DEVICES on all workers.

        For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs
        visible to all workers on that worker's node.

        This allows GPU workers on the same node to communicate with one
        another.

        Example:

            Setup:
            - Node1:
                - Worker1: {0, 1}
                - Worker2: {2, 3}
            - Node2:
                - Worker3: {0, 1}

            CUDA_VISIBLE_DEVICES:
            - Worker1: "0,1,2,3"
            - Worker2: "0,1,2,3"
            - Worker3: "0,1"

        """
        self._share_resource_ids(
            ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR
        )

    def _share_resource_ids(self, resource: str, env_var: str):
        """Sets the given env_var on all workers.

        For each worker, the cores/devices are visible to all the
        workers on that worker's node.This allows workers on the
        same node to communicate with one another.

        Example:

            Setup:
            - Node1:
                - Worker1: {0, 1}
                - Worker2: {2, 3}
            - Node2:
                - Worker3: {0, 1}

            NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...:
            - Worker1: "0,1,2,3"
            - Worker2: "0,1,2,3"
            - Worker2: "0,1"

        Args:
            resource: The name of the resource/accelerator.
            env_var: The name of the environment variable to set.
        """
        node_ids_and_resource_ids = [
            (
                w.metadata.node_id,
                w.metadata.resource_ids[resource],
            )
            for w in self.worker_group.workers
        ]
        node_id_to_worker_id = defaultdict(set)
        node_id_to_resource_ids = defaultdict(set)

        for worker_id, (node_id, resource_ids) in enumerate(node_ids_and_resource_ids):
            node_id_to_worker_id[node_id].add(worker_id)
            node_id_to_resource_ids[node_id].update(resource_ids)

        futures = []
        for node_id, resource_ids in node_id_to_resource_ids.items():
            resource_ids = sorted(resource_ids)
            all_resource_ids = ",".join(resource_ids)

            def set_resource_ids():
                os.environ[env_var] = all_resource_ids

            for worker_id in node_id_to_worker_id[node_id]:
                futures.append(
                    self.worker_group.execute_single_async(worker_id, set_resource_ids)
                )
        ray.get(futures)

    def _is_share_resources_enabled(self, resource_name: str, enable_sharing_env: str):
        """Whether to share resource IDs on all workers
        based on enable_sharing_env.

        This will return true if resources are requested and greater than 0.
        Also, user can disable by configuring the `enable_sharing_env` to "0".

        Args:
            resource_name: The name of the resource/accelerator.
            enable_sharing_env: The name of the environment variable
                to check.
        """
        has_resource_requested = self._resources_per_worker.get(resource_name, 0) > 0
        return has_resource_requested and ray_constants.env_bool(
            enable_sharing_env, True
        )

    def _create_rank_world_size_mappings(self) -> List[Dict]:
        """Create rank and world size mappings for workers.
        There are three maps returned:
            - local_rank_map, which maps from worker world_rank to local_rank.
            - local_world_size_map, which maps from world_rank to local_world_size
            - node_rank_map, which maps from world rank to node rank

        Example:
            Worker 0: node 0
            Worker 1: node 0
            Worker 2: node 1
            Worker 3: node 0
            Worker 4: node 1

            Workers 0, 1, 3 are on node 0.
            Workers 2, 4 are on node 1.

            Expected local_rank_map:
            {
                0 -> 0,
                1 -> 1,
                2 -> 0,
                3 -> 2,
                4 -> 1
            }

            Expected local_world_size_map:
            {
                0 -> 3,
                1 -> 3,
                2 -> 2,
                3 -> 3,
                4 -> 2
            }

            Expected node_rank_map:
            {
                0 -> 0,
                1 -> 0,
                2 -> 1,
                3 -> 0,
                4 -> 1
            }

        """
        local_rank_map = {}  # map from world rank to local rank
        local_world_size_map = {}  # map from world rank to local world size
        node_rank_map = {}  # map from world rank to node rank
        node_ids = {}  # map from node id to node index
        node_cnt = 0  # count the number of nodes

        node_id_dict = defaultdict(
            int
        )  # map from node id to the number of workers on it.
        for world_rank in range(len(self.worker_group)):
            worker = self.worker_group.workers[world_rank]
            node_id = worker.metadata.node_id
            local_rank_map[world_rank] = node_id_dict[node_id]
            node_id_dict[node_id] += 1

            if node_id not in node_ids:
                node_ids[node_id] = node_cnt
                node_cnt += 1
            node_rank_map[world_rank] = node_ids[node_id]

        for world_rank in range(len(self.worker_group)):
            worker = self.worker_group.workers[world_rank]
            node_id = worker.metadata.node_id
            local_world_size_map[world_rank] = node_id_dict[node_id]

        workers_info = "\n".join(
            [
                f"- (node_id={w.metadata.node_id}, ip={w.metadata.node_ip}, "
                f"pid={w.metadata.pid}) world_rank={i}, "
                f"local_rank={local_rank_map[i]}, node_rank={node_rank_map[i]}"
                for i, w in enumerate(self.worker_group.workers)
            ]
        )
        logger.info(f"Started distributed worker processes: \n{workers_info}")

        return local_rank_map, local_world_size_map, node_rank_map

    def start_training(
        self,
        train_func: Callable[[], T],
        datasets: Dict[str, Dataset],
        metadata: Dict[str, Any],
        data_config: DataConfig,
        storage: StorageContext,
        checkpoint: Optional[Checkpoint] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func: The training function to run on each worker.
            datasets: The base datasets.
            data_config: The config object for creating dataset shards for workers.
            checkpoint: The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``session.get_checkpoint()``. If this
                is ``None`` then no checkpoint will be loaded.
        """
        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0
        )

        # First initialize the session.
        def initialize_session(
            train_func,
            world_rank,
            local_rank,
            node_rank,
            local_world_size,
            world_size,
            trial_info,
            checkpoint,
            dataset_shard,
            metadata,
            storage,
        ):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    node_rank=node_rank,
                    local_world_size=local_world_size,
                    world_size=world_size,
                    trial_info=trial_info,
                    dataset_shard=dataset_shard,
                    metadata=metadata,
                    checkpoint=checkpoint,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics,
                    storage=storage,
                )
            except ValueError:
                raise TrainBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again."
                )

        if self.dataset_shards is None:
            actors = [worker.actor for worker in self.worker_group.workers]
            node_ids = [worker.metadata.node_id for worker in self.worker_group.workers]
            self.dataset_shards = data_config.configure(
                datasets,
                world_size=len(self.worker_group),
                worker_handles=actors,
                worker_node_ids=node_ids,
            )

        (
            local_rank_map,
            local_world_size_map,
            node_rank_map,
        ) = self._create_rank_world_size_mappings()

        futures = []
        for index in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    index,
                    initialize_session,
                    world_rank=index,
                    local_rank=local_rank_map[index],
                    node_rank=node_rank_map[index],
                    local_world_size=local_world_size_map[index],
                    world_size=len(self.worker_group),
                    trial_info=self._trial_info,
                    train_func=train_func,
                    dataset_shard=self.dataset_shards[index],
                    metadata=metadata,
                    checkpoint=checkpoint,
                    storage=storage,
                )
            )

        self._backend.on_training_start(self.worker_group, self._backend_config)

        self.get_with_failure_handling(futures)

        # Register Train Run before training starts
        if self.state_tracking_enabled:
            from ray.train._internal.state.schema import RunStatusEnum

            core_context = ray.runtime_context.get_runtime_context()

            self.state_manager.register_train_run(
                run_id=self._trial_info.run_id,
                run_name=self._trial_info.experiment_name,
                job_id=core_context.get_job_id(),
                controller_actor_id=core_context.get_actor_id(),
                datasets=datasets,
                worker_group=self.worker_group,
                start_time_ms=self._start_time_ms,
                run_status=RunStatusEnum.RUNNING,
            )

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)

    def get_next_results(self) -> Optional[List[_TrainingResult]]:
        """Fetches the next ``_TrainingResult`` from each worker.

        Each ``_TrainingResult`` is expected to correspond to the same step from
        each worker (e.g. the same call to ``train.report()``).

        Returns:
            A list of ``_TrainingResult``s or ``None`` if there are no more results
            since the training function has exited on all workers.
        """

        def get_next():
            session = _get_session("get_next_results")
            try:
                result = session.get_next()
            except RuntimeError:
                # Training thread has not been started yet.
                raise TrainBackendError(
                    "`get_next_results` has been called "
                    "before `start_training`. Please call "
                    "`start_training` before "
                    "`get_next_results`."
                )

            return result

        # Get next result from each worker.
        futures = self.worker_group.execute_async(get_next)
        results = self.get_with_failure_handling(futures)

        # Check if any worker returned None.
        if any(r is None for r in results):
            # Either all workers have results or none of them do.
            if not all(r is None for r in results):
                raise RuntimeError(
                    "Some workers returned results while "
                    "others didn't. Make sure that "
                    "`session.report()` are called the "
                    "same number of times on all workers."
                )
            else:
                # Return None if all results are None.
                return None

        return results

    def pause_reporting(self):
        """Disable workers from enqueuing results from ``session.report()``.

        Note: Already reported results may still be enqueued at this point,
              and should be handled appropriately.
        """

        def pause_session_reporting():
            session = _get_session("pause_reporting")
            return session.pause_reporting()

        futures = self.worker_group.execute_async(pause_session_reporting)
        self.get_with_failure_handling(futures)

    def finish_training(self):
        """Finish training and return final results. Propagate any exceptions.

        Blocks until training is finished on all workers.

        Assumes `start_training` has already been called.

        Returns:
            A list of return values from calling ``train_func`` on each worker.
                Each item corresponds to the return value from a single worker.
        """

        def end_training():
            session = _get_session("finish_training")
            try:
                # session.finish raises any Exceptions from training.
                output = session.finish()
            finally:
                # Shutdown session even if session.finish() raises an
                # Exception.
                shutdown_session()

            return output

        futures = self.worker_group.execute_async(end_training)
        results = self.get_with_failure_handling(futures)
        return results

    def report_final_run_status(
        self,
        errored: bool = False,
        failed_rank: Optional[int] = None,
        stack_trace: Optional[str] = None,
    ):
        """Report the final train run status, error, and end time to TrainStateActor."""
        if self.state_tracking_enabled:
            from ray.train._internal.state.schema import (
                MAX_ERROR_STACK_TRACE_LENGTH,
                RunStatusEnum,
            )

            if errored:
                run_status = RunStatusEnum.ERRORED
                status_detail = ""
                if failed_rank is not None:
                    status_detail += f"Rank {failed_rank} worker raised an error. \n"
                if stack_trace is not None:
                    # Keep only the last part of the stack trace if it's too long.
                    status_detail += stack_trace[-MAX_ERROR_STACK_TRACE_LENGTH:]
            else:
                run_status = RunStatusEnum.FINISHED
                status_detail = ""

            self.state_manager.end_train_run(
                run_id=self._trial_info.run_id,
                run_status=run_status,
                status_detail=status_detail,
                end_time_ms=int(time.time() * 1000),
            )

    def get_with_failure_handling(self, remote_values):
        """Gets the remote values while handling for worker failures.

        This method should be called instead of ``ray.get()`` directly in
        order to handle worker failures.

        If a worker failure is identified, backend specific failure handling
        is executed and a ``TrainingWorkerError`` is raised.

        Args:
            remote_values: List of object refs representing functions
                that may fail in the middle of execution. For example, running
                a Train training loop in multiple parallel actor calls.
        Returns:
            The resolved objects represented by the passed in ObjectRefs.
        """
        success, exception = check_for_failure(remote_values)
        if success:
            return ray.get(remote_values)
        else:
            self._last_failure = exception
            self._increment_failures()
            logger.warning(
                "Failure identified during training. Restarting all workers and "
                "continuing training from latest checkpoint."
            )
            self._restart()
            raise TrainingWorkerError

    def shutdown(self, graceful_termination: bool = True):
        """Shuts down the workers in the worker group.

        Args:
            graceful_termination: If set to True, attempt to clean up the backend
                before terminating the Ray actors.

        """
        if graceful_termination:
            try:
                self._backend.on_shutdown(self.worker_group, self._backend_config)
            except RayActorError:
                logger.warning(
                    "Graceful shutdown of backend failed. This is "
                    "expected if one of the workers has crashed."
                )

        if graceful_termination:
            self.worker_group.shutdown()
        else:
            self.worker_group.shutdown(patience_s=0)
        self.worker_group = InactiveWorkerGroup()

        if self._placement_group:
            remove_placement_group(self._placement_group)
            self._placement_group = None

        self.dataset_shards = None

    def is_started(self):
        return not isinstance(self.worker_group, InactiveWorkerGroup)

    def _restart(self):
        self.worker_group.shutdown()
        if self._initialization_hook is not None:
            initialization_hook = self._initialization_hook
        else:
            initialization_hook = None
        if self._placement_group:
            remove_placement_group(self._placement_group)
            self._placement_group = None
        self.start(initialization_hook=initialization_hook)

    def _increment_failures(self):
        self._num_failures += 1
        if self._num_failures >= self._max_failures:
            failure = self._last_failure
            self._last_failure = None
            if self._max_failures > 0:
                exc = RuntimeError(
                    "Training has failed after " f"{self._num_failures} " "attempts."
                )
                raise exc.with_traceback(None) from failure
            else:
                raise failure

    def get_worker_group(self):
        return self.worker_group

    def _get_num_failures(self):
        return self._num_failures


class InactiveWorkerGroupError(Exception):
    """Raised when underlying worker group is inactive."""


class InactiveWorkerGroup:
    # TODO: fix inheritence. perhaps create WorkerGroupInterface.

    # Need to define getstate and setstate so that getattr does not screwup
    # pickling. See https://stackoverflow.com/a/50888571/11249691
    def __getstate__(self):
        return vars(self)

    def __setstate__(self, state):
        vars(self).update(state)

    def __getattr__(self, name):
        raise InactiveWorkerGroupError()

    def __len__(self):
        raise InactiveWorkerGroupError()


def _get_session(method_name: str):
    # Get the session for this worker.
    session = get_session()
    if not session:
        # Session is not initialized yet.
        raise TrainBackendError(
            f"`{method_name}` has been called "
            "before `start_training`. Please call "
            "`start_training` before "
            f"`{method_name}`."
        )
    return session
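The worked example in the `_create_rank_world_size_mappings` docstring above is the clearest way to see the rank bookkeeping. The following self-contained sketch reproduces the same two-pass computation outside of Ray; representing each worker by a plain node-id string is an assumption made only for this example.

from collections import defaultdict


# Standalone sketch of the bookkeeping in
# BackendExecutor._create_rank_world_size_mappings, with a plain list of
# node ids standing in for worker metadata (an assumption for this sketch).
def rank_mappings(worker_node_ids):
    local_rank_map, local_world_size_map, node_rank_map = {}, {}, {}
    node_index = {}  # node id -> node rank (order of first appearance)
    workers_per_node = defaultdict(int)  # node id -> workers counted so far

    # First pass: assign local ranks and node ranks.
    for world_rank, node_id in enumerate(worker_node_ids):
        local_rank_map[world_rank] = workers_per_node[node_id]
        workers_per_node[node_id] += 1
        if node_id not in node_index:
            node_index[node_id] = len(node_index)
        node_rank_map[world_rank] = node_index[node_id]

    # Second pass: the local world size is the final per-node worker count.
    for world_rank, node_id in enumerate(worker_node_ids):
        local_world_size_map[world_rank] = workers_per_node[node_id]

    return local_rank_map, local_world_size_map, node_rank_map


# Layout from the docstring: workers 0, 1, 3 on node 0; workers 2, 4 on node 1.
local_rank, local_ws, node_rank = rank_mappings(["n0", "n0", "n1", "n0", "n1"])
print(local_rank)  # {0: 0, 1: 1, 2: 0, 3: 2, 4: 1}
print(local_ws)    # {0: 3, 1: 3, 2: 2, 3: 3, 4: 2}
print(node_rank)   # {0: 0, 1: 0, 2: 1, 3: 0, 4: 1}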
.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py
ADDED
@@ -0,0 +1,185 @@
import logging
import numbers
from typing import Any, Callable, List, Optional, Tuple

from ray._private.dict import flatten_dict
from ray.air._internal.util import is_nan
from ray.air.config import MAX
from ray.train import CheckpointConfig
from ray.train._internal.session import _TrainingResult
from ray.train._internal.storage import _delete_fs_path

logger = logging.getLogger(__name__)


def _insert_into_sorted_list(list: List[Any], item: Any, key: Callable[[Any], Any]):
    """Insert an item into a sorted list with a custom key function.

    Examples:

        >>> list = []
        >>> _insert_into_sorted_list(list, {"a": 1, "b": 0}, lambda x: x["a"])
        >>> list
        [{'a': 1, 'b': 0}]
        >>> _insert_into_sorted_list(list, {"a": 3, "b": 1}, lambda x: x["a"])
        >>> list
        [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}]
        >>> _insert_into_sorted_list(list, {"a": 4, "b": 2}, lambda x: x["a"])
        >>> list
        [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}]
        >>> _insert_into_sorted_list(list, {"a": 1, "b": 3}, lambda x: x["a"])
        >>> list
        [{'a': 1, 'b': 0}, {'a': 1, 'b': 3}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}]
    """
    i = 0
    while i < len(list):
        # Insert to the right of all duplicates.
        if key(list[i]) > key(item):
            break
        i += 1
    list.insert(i, item)


class _CheckpointManager:
    """Checkpoint manager that handles checkpoint book-keeping for a trial.

    The main purpose of this abstraction is to keep the top K checkpoints based on
    recency/a user-provided metric.

    NOTE: This class interacts with `_TrainingResult` objects, which are
    (checkpoint, metrics) pairs. This is to order checkpoints by metrics.

    Args:
        checkpoint_config: Defines how many and which checkpoints to keep.
    """

    def __init__(self, checkpoint_config: Optional[CheckpointConfig]):
        self._checkpoint_config = checkpoint_config or CheckpointConfig()

        # List of checkpoints ordered by ascending score.
        self._checkpoint_results: List[_TrainingResult] = []

        # The latest registered checkpoint.
        # This should never be immediately deleted upon registration,
        # even if it's not in the top K checkpoints, based on score.
        self._latest_checkpoint_result: Optional[_TrainingResult] = None

        if (
            self._checkpoint_config.num_to_keep is not None
            and self._checkpoint_config.num_to_keep <= 0
        ):
            raise ValueError(
                f"`num_to_keep` must >= 1, got: "
                f"{self._checkpoint_config.num_to_keep}"
            )

    @property
    def checkpoint_config(self):
        return self._checkpoint_config

    def register_checkpoint(self, checkpoint_result: _TrainingResult):
        """Register new checkpoint and add to bookkeeping.

        This method will register a new checkpoint and add it to the internal
        bookkeeping logic. This means the checkpoint manager will decide if
        this checkpoint should be kept, and if older or worse performing
        checkpoints should be deleted.

        Args:
            checkpoint: Tracked checkpoint object to add to bookkeeping.
        """
        self._latest_checkpoint_result = checkpoint_result

        if self._checkpoint_config.checkpoint_score_attribute is not None:
            # If we're ordering by a score, insert the checkpoint
            # so that the list remains sorted.
            _insert_into_sorted_list(
                self._checkpoint_results,
                checkpoint_result,
                key=self._get_checkpoint_score,
            )
        else:
            # If no metric is provided, just append (ordering by time of registration).
            self._checkpoint_results.append(checkpoint_result)

        if self._checkpoint_config.num_to_keep is not None:
            # Delete the bottom (N - K) checkpoints
            worst_results = set(
                self._checkpoint_results[: -self._checkpoint_config.num_to_keep]
            )
            # Except for the latest checkpoint.
            results_to_delete = worst_results - {self._latest_checkpoint_result}

            # Update internal state before actually deleting them.
            self._checkpoint_results = [
                checkpoint_result
                for checkpoint_result in self._checkpoint_results
                if checkpoint_result not in results_to_delete
            ]

            for checkpoint_result in results_to_delete:
                checkpoint = checkpoint_result.checkpoint
                logger.debug("Deleting checkpoint: ", checkpoint)
                _delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path)

    def _get_checkpoint_score(
        self, checkpoint: _TrainingResult
    ) -> Tuple[bool, numbers.Number]:
        """Get the score for a checkpoint, according to checkpoint config.

        If `mode="min"`, the metric is negated so that the lowest score is
        treated as the best.

        Returns:
            Tuple: A tuple of (not_is_nan: bool, score: numbers.Number).
            This score orders: nan values < float("-inf") < valid numeric metrics
        """
        checkpoint_score_attribute = self._checkpoint_config.checkpoint_score_attribute
        if checkpoint_score_attribute:
            flat_metrics = flatten_dict(checkpoint.metrics)
            try:
                checkpoint_result = flat_metrics[checkpoint_score_attribute]
            except KeyError:
                valid_keys = list(flat_metrics.keys())
                logger.error(
                    f"Result dict has no key: {checkpoint_score_attribute}. "
                    f"checkpoint_score_attr must be set to a key in the "
                    f"result dict. Valid keys are: {valid_keys}"
                )
                checkpoint_result = float("-inf")
        else:
            checkpoint_result = float("-inf")

        checkpoint_score_order = self._checkpoint_config.checkpoint_score_order
        order_factor = 1.0 if checkpoint_score_order == MAX else -1.0

        checkpoint_score = order_factor * checkpoint_result

        if not isinstance(checkpoint_score, numbers.Number):
            raise ValueError(
                f"Unable to persist checkpoint for "
                f"checkpoint_score_attribute: "
                f"{checkpoint_score_attribute} with value "
                f"{checkpoint_score}. "
                f"This attribute must be numerical."
            )

        return (
            (not is_nan(checkpoint_score), checkpoint_score)
            if not is_nan(checkpoint_score)
            else (False, float("-inf"))
        )

    @property
    def best_checkpoint_result(self) -> Optional[_TrainingResult]:
        return self._checkpoint_results[-1] if self._checkpoint_results else None

    @property
    def latest_checkpoint_result(self) -> Optional[_TrainingResult]:
        return self._latest_checkpoint_result

    @property
    def best_checkpoint_results(self) -> List[_TrainingResult]:
        if self._checkpoint_config.num_to_keep is None:
            return self._checkpoint_results
        return self._checkpoint_results[-self._checkpoint_config.num_to_keep :]
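A note on the ordering used by `_CheckpointManager` above: checkpoints are kept sorted in ascending order of a `(not_is_nan, signed_score)` key, so NaN metrics always sort worst and, with a "min" metric, the sign flip makes the smallest value the best (last) entry. A minimal sketch of that key, using plain metric dicts in place of `_TrainingResult` objects (an assumption made only for this example):

import math


# Sketch of the checkpoint ordering key: (not_is_nan, signed_score).
# Plain dicts stand in for _TrainingResult objects here.
def score_key(metrics, attr, mode="max"):
    value = metrics.get(attr, float("-inf"))
    signed = value if mode == "max" else -value
    if math.isnan(signed):
        return (False, float("-inf"))
    return (True, signed)


results = [{"loss": 0.9}, {"loss": 0.4}, {"loss": float("nan")}, {"loss": 0.7}]
ordered = sorted(results, key=lambda m: score_key(m, "loss", mode="min"))
print(ordered)       # [{'loss': nan}, {'loss': 0.9}, {'loss': 0.7}, {'loss': 0.4}]
print(ordered[-1])   # {'loss': 0.4}, analogous to best_checkpoint_result
print(ordered[-2:])  # top 2, analogous to best_checkpoint_results with num_to_keep=2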
.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+import copy
+from typing import Dict, List, Literal, Optional, Union
+
+import ray
+from ray.actor import ActorHandle
+from ray.data import DataIterator, Dataset, ExecutionOptions, NodeIdStr
+from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
+from ray.util.annotations import DeveloperAPI, PublicAPI
+
+
+@PublicAPI(stability="stable")
+class DataConfig:
+    """Class responsible for configuring Train dataset preprocessing.
+
+    For advanced use cases, this class can be subclassed and the `configure()` method
+    overriden for custom data preprocessing.
+    """
+
+    def __init__(
+        self,
+        datasets_to_split: Union[Literal["all"], List[str]] = "all",
+        execution_options: Optional[ExecutionOptions] = None,
+    ):
+        """Construct a DataConfig.
+
+        Args:
+            datasets_to_split: Specifies which datasets should be split among workers.
+                Can be set to "all" or a list of dataset names. Defaults to "all",
+                i.e. split all datasets.
+            execution_options: The execution options to pass to Ray Data. By default,
+                the options will be optimized for data ingest. When overriding this,
+                base your options off of `DataConfig.default_ingest_options()`.
+        """
+        if isinstance(datasets_to_split, list) or datasets_to_split == "all":
+            self._datasets_to_split = datasets_to_split
+        else:
+            raise TypeError(
+                "`datasets_to_split` should be a 'all' or a list of strings of "
+                "dataset names. Received "
+                f"{type(datasets_to_split).__name__} with value {datasets_to_split}."
+            )
+
+        self._execution_options: ExecutionOptions = (
+            execution_options or DataConfig.default_ingest_options()
+        )
+
+        self._num_train_cpus = 0.0
+        self._num_train_gpus = 0.0
+
+    def set_train_total_resources(self, num_train_cpus: float, num_train_gpus: float):
+        """Set the total number of CPUs and GPUs used by training.
+
+        If CPU or GPU resource limits are not set, they will be set to the
+        total cluster resources minus the resources used by training.
+        """
+        # TODO: We may also include other resources besides CPU and GPU.
+        self._num_train_cpus = num_train_cpus
+        self._num_train_gpus = num_train_gpus
+
+    @DeveloperAPI
+    def configure(
+        self,
+        datasets: Dict[str, Dataset],
+        world_size: int,
+        worker_handles: Optional[List[ActorHandle]],
+        worker_node_ids: Optional[List[NodeIdStr]],
+        **kwargs,
+    ) -> List[Dict[str, DataIterator]]:
+        """Configure how Train datasets should be assigned to workers.
+
+        Args:
+            datasets: The datasets dict passed to Train by the user.
+            world_size: The number of Train workers in total.
+            worker_handles: The actor handles of the Train workers.
+            worker_node_ids: The node ids of the Train workers.
+            kwargs: Forwards compatibility placeholder.
+
+        Returns:
+            A list of dataset splits for each worker. The size of the list must be
+            equal to `world_size`. Each element of the list contains the assigned
+            `DataIterator` instances by name for the worker.
+        """
+        output = [{} for _ in range(world_size)]
+
+        if self._datasets_to_split == "all":
+            datasets_to_split = set(datasets.keys())
+        else:
+            datasets_to_split = set(self._datasets_to_split)
+
+        locality_hints = (
+            worker_node_ids if self._execution_options.locality_with_output else None
+        )
+        for name, ds in datasets.items():
+            execution_options = copy.deepcopy(self._execution_options)
+
+            if execution_options.is_resource_limits_default():
+                # If "resource_limits" is not overriden by the user,
+                # add training-reserved resources to Data's exclude_resources.
+                execution_options.exclude_resources = (
+                    execution_options.exclude_resources.add(
+                        ExecutionResources(
+                            cpu=self._num_train_cpus, gpu=self._num_train_gpus
+                        )
+                    )
+                )
+
+            ds = ds.copy(ds)
+            ds.context.execution_options = execution_options
+
+            if name in datasets_to_split:
+                for i, split in enumerate(
+                    ds.streaming_split(
+                        world_size, equal=True, locality_hints=locality_hints
+                    )
+                ):
+                    output[i][name] = split
+            else:
+                for i in range(world_size):
+                    output[i][name] = ds.iterator()
+
+        return output
+
+    @staticmethod
+    def default_ingest_options() -> ExecutionOptions:
+        """The default Ray Data options used for data ingest.
+
+        By default, configurations are carried over from what is already set
+        in DataContext.
+        """
+        ctx = ray.data.DataContext.get_current()
+        return ExecutionOptions(
+            # TODO(hchen): Re-enable `locality_with_output` by default after fixing
+            # https://github.com/ray-project/ray/issues/40607
+            locality_with_output=ctx.execution_options.locality_with_output,
+            resource_limits=ctx.execution_options.resource_limits,
+            exclude_resources=ctx.execution_options.exclude_resources,
+            preserve_order=ctx.execution_options.preserve_order,
+            verbose_progress=ctx.execution_options.verbose_progress,
+        )
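As the class docstring above notes, `configure()` is the extension point for custom shard assignment. A hedged sketch of what such a subclass might look like (the class name and behavior are illustrative, not part of the added file; splitting only "train" is already available via `datasets_to_split=["train"]`, so the subclass exists purely to show where custom logic would go):

from typing import Dict, List, Optional

from ray.actor import ActorHandle
from ray.data import DataIterator, Dataset, NodeIdStr
from ray.train import DataConfig


class UnsharedValidationDataConfig(DataConfig):
    """Illustrative subclass: shard only the "train" dataset and give every
    worker a full copy of everything else (e.g. a small validation set)."""

    def __init__(self):
        super().__init__(datasets_to_split=["train"])

    def configure(
        self,
        datasets: Dict[str, Dataset],
        world_size: int,
        worker_handles: Optional[List[ActorHandle]],
        worker_node_ids: Optional[List[NodeIdStr]],
        **kwargs,
    ) -> List[Dict[str, DataIterator]]:
        # Reuse the default splitting logic, then adjust the per-worker dicts
        # here if extra iterators or custom locality handling are needed.
        return super().configure(
            datasets, world_size, worker_handles, worker_node_ids, **kwargs
        )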
.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py
ADDED
@@ -0,0 +1,103 @@
+import abc
+from typing import Dict, Optional, TypeVar, Union
+
+import numpy as np
+import pandas as pd
+
+from ray.air.util.data_batch_conversion import (
+    BatchFormat,
+    _convert_batch_type_to_pandas,
+    _convert_pandas_to_batch_type,
+)
+from ray.train.predictor import Predictor
+from ray.util.annotations import DeveloperAPI
+
+TensorType = TypeVar("TensorType")
+TensorDtype = TypeVar("TensorDtype")
+
+
+class DLPredictor(Predictor):
+    @abc.abstractmethod
+    def _arrays_to_tensors(
+        self,
+        numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]],
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
+    ) -> Union[TensorType, Dict[str, TensorType]]:
+        """Converts a NumPy ndarray batch to the tensor type for the DL framework.
+
+        Args:
+            numpy_array: The numpy array to convert to a tensor.
+            dtype: The tensor dtype to use when creating the DL tensor.
+            ndarray: A (dict of) NumPy ndarray(s) that we wish to convert to a (dict of)
+                tensor(s).
+            dtype: A (dict of) tensor dtype(s) to use when creating the DL tensor; if
+                None, the dtype will be inferred from the NumPy ndarray data.
+
+        Returns:
+            A deep learning framework specific tensor.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def _tensor_to_array(self, tensor: TensorType) -> np.ndarray:
+        """Converts tensor framework specific tensor to a numpy array.
+
+        Args:
+            tensor: A framework specific tensor.
+
+        Returns:
+            A numpy array representing the input tensor.
+        """
+
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    @DeveloperAPI
+    def call_model(
+        self, inputs: Union[TensorType, Dict[str, TensorType]]
+    ) -> Union[TensorType, Dict[str, TensorType]]:
+        """Inputs the tensor to the model for this Predictor and returns the result.
+
+        Args:
+            inputs: The tensor to input to the model.
+
+        Returns:
+            A tensor or dictionary of tensors containing the model output.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    @DeveloperAPI
+    def preferred_batch_format(cls) -> BatchFormat:
+        return BatchFormat.NUMPY
+
+    def _predict_pandas(
+        self,
+        data: pd.DataFrame,
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
+    ) -> pd.DataFrame:
+        numpy_input = _convert_pandas_to_batch_type(
+            data,
+            BatchFormat.NUMPY,
+            self._cast_tensor_columns,
+        )
+        numpy_output = self._predict_numpy(numpy_input, dtype)
+        return _convert_batch_type_to_pandas(numpy_output)
+
+    def _predict_numpy(
+        self,
+        data: Union[np.ndarray, Dict[str, np.ndarray]],
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
+    ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
+        # Single column selection return numpy array so preprocessors can be
+        # reused in both training and prediction
+        if isinstance(data, dict) and len(data) == 1:
+            data = next(iter(data.values()))
+        model_input = self._arrays_to_tensors(data, dtype)
+        model_output = self.call_model(model_input)
+        # TODO (jiaodong): Investigate perf implication of this.
+        # Move DL Tensor to CPU and convert to numpy.
+        if isinstance(model_output, dict):
+            return {k: self._tensor_to_array(v) for k, v in model_output.items()}
+        else:
+            return {"predictions": self._tensor_to_array(model_output)}
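The dict unwrapping and re-wrapping in `_predict_numpy` above is easy to miss. A framework-free sketch of just that batch handling (pure NumPy; `identity_model` and `predict_numpy_like` are illustrative stand-ins for the abstract `call_model()` and the method above):

import numpy as np

def identity_model(batch):
    # Stand-in for a framework model call; returns the input unchanged.
    return batch

def predict_numpy_like(data):
    # A single-column dict batch is unwrapped to a bare ndarray before the
    # model call, and a non-dict model output is re-wrapped under
    # a "predictions" key, mirroring DLPredictor._predict_numpy.
    if isinstance(data, dict) and len(data) == 1:
        data = next(iter(data.values()))
    output = identity_model(data)
    if isinstance(output, dict):
        return output
    return {"predictions": output}

print(predict_numpy_like({"x": np.arange(4)}))  # {'predictions': array([0, 1, 2, 3])}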
.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py
ADDED
@@ -0,0 +1,45 @@
+from typing import Optional
+
+import ray.cloudpickle as ray_pickle
+from ray._private.utils import binary_to_hex, hex_to_binary
+from ray.data.preprocessor import Preprocessor
+from ray.train._checkpoint import Checkpoint
+
+PREPROCESSOR_KEY = "preprocessor_pkl"
+
+
+class FrameworkCheckpoint(Checkpoint):
+    """A checkpoint to preserve the functionality of legacy
+    framework-specific checkpoints.
+
+    Example:
+
+        >>> import tempfile
+        >>> checkpoint = FrameworkCheckpoint(tempfile.mkdtemp())
+        >>> checkpoint.get_preprocessor() is None
+        True
+        >>> preprocessor = Preprocessor()
+        >>> preprocessor._attr = 1234
+        >>> checkpoint.set_preprocessor(preprocessor)
+        >>> checkpoint.get_preprocessor()._attr
+        1234
+    """
+
+    def get_preprocessor(self) -> Optional[Preprocessor]:
+        """Return the preprocessor stored in the checkpoint.
+
+        Returns:
+            The preprocessor stored in the checkpoint, or ``None`` if no
+            preprocessor was stored.
+        """
+        metadata = self.get_metadata()
+        preprocessor_bytes = metadata.get(PREPROCESSOR_KEY)
+        if preprocessor_bytes is None:
+            return None
+        return ray_pickle.loads(hex_to_binary(preprocessor_bytes))
+
+    def set_preprocessor(self, preprocessor: Preprocessor):
+        """Store a preprocessor with the checkpoint."""
+        self.update_metadata(
+            {PREPROCESSOR_KEY: binary_to_hex(ray_pickle.dumps(preprocessor))}
+        )
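A usage sketch that round-trips a preprocessor through checkpoint metadata, adapted from the class docstring above (assumes a working Ray installation; the `_internal` import path mirrors this file's location and may shift between Ray versions):

import tempfile

from ray.data.preprocessor import Preprocessor
from ray.train._internal.framework_checkpoint import FrameworkCheckpoint

# The preprocessor is pickled into the checkpoint's metadata under
# PREPROCESSOR_KEY, so it survives without touching the checkpoint files.
checkpoint = FrameworkCheckpoint(tempfile.mkdtemp())
preprocessor = Preprocessor()
preprocessor._attr = 1234
checkpoint.set_preprocessor(preprocessor)
assert checkpoint.get_preprocessor()._attr == 1234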
.venv/lib/python3.11/site-packages/ray/train/_internal/session.py
ADDED
@@ -0,0 +1,1163 @@
+import functools
+import logging
+import os
+import platform
+import queue
+import sys
+import threading
+import time
+import warnings
+from dataclasses import dataclass
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Set, Type
+
+import ray
+from ray.air._internal.util import RunnerThread, StartTraceback
+from ray.air.constants import (
+    _ERROR_FETCH_TIMEOUT,
+    _RESULT_FETCH_TIMEOUT,
+    SESSION_MISUSE_LOG_ONCE_KEY,
+    TIME_THIS_ITER_S,
+    TIMESTAMP,
+)
+from ray.data import Dataset
+from ray.train import Checkpoint
+from ray.train._internal.accelerator import Accelerator
+from ray.train._internal.storage import StorageContext
+from ray.train.constants import (
+    CHECKPOINT_DIR_NAME,
+    DETAILED_AUTOFILLED_KEYS,
+    RAY_CHDIR_TO_TRIAL_DIR,
+    TIME_TOTAL_S,
+    WORKER_HOSTNAME,
+    WORKER_NODE_IP,
+    WORKER_PID,
+    _v2_migration_warnings_enabled,
+)
+from ray.train.error import SessionMisuseError
+from ray.train.utils import _log_deprecation_warning
+from ray.util.annotations import DeveloperAPI, PublicAPI
+from ray.util.debug import log_once
+from ray.util.placement_group import _valid_resource_shape
+from ray.util.scheduling_strategies import (
+    PlacementGroupSchedulingStrategy,
+    SchedulingStrategyT,
+)
+
+if TYPE_CHECKING:
+    from ray.data import DataIterator
+    from ray.tune.execution.placement_groups import PlacementGroupFactory
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TrialInfo:
+    """The trial information to propagate to TrainSession."""
+
+    name: str
+    id: str
+    resources: Dict[str, float]
+    logdir: str
+    driver_ip: str
+    driver_node_id: str
+    experiment_name: Optional[str] = None
+    run_id: Optional[str] = None
+
+
+class _FutureTrainingResult:
+    """A future that will be resolved to a `_TrainingResult`.
+
+    This is needed for specific schedulers such as PBT that schedule saves.
+
+    This wrapper should be removed after refactoring PBT to not schedule saves anymore.
+    """
+
+    def __init__(self, future: ray.ObjectRef):
+        self.future = future
+
+    def resolve(self, block: bool = True) -> Optional["_TrainingResult"]:
+        """Resolve into ``_TrainingResult``.
+
+        This will return None for function trainables if no checkpoint has been
+        saved before.
+        """
+        if block:
+            timeout = None
+        else:
+            timeout = 1e-9
+        try:
+            return ray.get(self.future, timeout=timeout)
+        except TimeoutError:
+            # Not ready, yet
+            pass
+        except Exception as exc:
+            logger.error(f"Error resolving result: {exc}")
+
+
+class _TrainingResult:
+    """A (checkpoint, metrics) result reported by the user."""
+
+    def __init__(self, checkpoint: Optional[Checkpoint], metrics: Dict[str, Any]):
+        self.checkpoint = checkpoint
+        self.metrics = metrics
+
+    def __repr__(self) -> str:
+        return f"TrainingResult(checkpoint={self.checkpoint}, metrics={self.metrics})"
+
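`_TrainingResult` is the internal pairing of whatever a worker passes to `ray.train.report`: a metrics dict plus an optional `Checkpoint`, which the session places on its result queue. A hedged sketch of the user-side call that produces it, adapted from the `report()` docstring later in this file (meant to run inside a `train_loop_per_worker`, not as a standalone script; the metric values are illustrative):

import tempfile

from ray import train
from ray.train import Checkpoint

def train_func(config):
    for epoch in range(config.get("num_epochs", 2)):
        metrics = {"loss": 1.0 / (epoch + 1)}
        with tempfile.TemporaryDirectory() as tmpdir:
            # Save framework state into tmpdir here, then report it;
            # the session wraps (checkpoint, metrics) as a _TrainingResult.
            checkpoint = Checkpoint.from_directory(tmpdir)
            train.report(metrics, checkpoint=checkpoint)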
| 109 |
+
|
| 110 |
+
# TODO(xwjiang): This needs a better name.
|
| 111 |
+
@DeveloperAPI
|
| 112 |
+
class _TrainSession:
|
| 113 |
+
"""Holds information for training on each worker."""
|
| 114 |
+
|
| 115 |
+
def __init__(
|
| 116 |
+
self,
|
| 117 |
+
training_func: Callable,
|
| 118 |
+
world_rank: Optional[int],
|
| 119 |
+
local_rank: Optional[int],
|
| 120 |
+
node_rank: Optional[int],
|
| 121 |
+
local_world_size: Optional[int],
|
| 122 |
+
world_size: Optional[int],
|
| 123 |
+
trial_info: Optional[TrialInfo] = None,
|
| 124 |
+
dataset_shard: Optional[Dict[str, Dataset]] = None,
|
| 125 |
+
metadata: Dict[str, Any] = None,
|
| 126 |
+
checkpoint: Optional[Checkpoint] = None,
|
| 127 |
+
detailed_autofilled_metrics: bool = False,
|
| 128 |
+
storage: Optional[StorageContext] = None,
|
| 129 |
+
synchronous_result_reporting: bool = False,
|
| 130 |
+
):
|
| 131 |
+
# `synchronous_result_reporting` refers to whether or not the
|
| 132 |
+
# training function is immediately unblocked to continue running
|
| 133 |
+
# after the main thread receives its result.
|
| 134 |
+
# Ex 1: For 2 Ray Train workers with synchronous_result_reporting=True,
|
| 135 |
+
# the worker that produces a result first will immediately will continue
|
| 136 |
+
# onto the next iteration.
|
| 137 |
+
# Ex 2: For a Tune function Trainable with `synchronous_result_reporting=False`,
|
| 138 |
+
# training will only continue with an explicit call to `session.get_next`.
|
| 139 |
+
# Synchronous reporting in example 2 is needed for Tune schedulers to
|
| 140 |
+
# be able to stop the execution of the training function at will,
|
| 141 |
+
# for advanced pausing schedulers (PBT, BOHB) and actor reuse.
|
| 142 |
+
self.synchronous_result_reporting = synchronous_result_reporting
|
| 143 |
+
|
| 144 |
+
# Ray Train worker properties
|
| 145 |
+
# Note: These are set to None for Tune function Trainables.
|
| 146 |
+
self.dataset_shard = dataset_shard
|
| 147 |
+
self.metadata = metadata
|
| 148 |
+
|
| 149 |
+
self.world_rank = world_rank
|
| 150 |
+
self.local_rank = local_rank
|
| 151 |
+
self.node_rank = node_rank
|
| 152 |
+
self.local_world_size = local_world_size
|
| 153 |
+
self.world_size = world_size
|
| 154 |
+
|
| 155 |
+
assert storage
|
| 156 |
+
logger.debug(f"StorageContext on SESSION (rank={world_rank}):\n{storage}")
|
| 157 |
+
|
| 158 |
+
# NOTE: `reset` will initialize many properties needed to start running the
|
| 159 |
+
# training_func as a thread.
|
| 160 |
+
self.reset(
|
| 161 |
+
training_func=training_func,
|
| 162 |
+
trial_info=trial_info,
|
| 163 |
+
storage=storage,
|
| 164 |
+
loaded_checkpoint=checkpoint,
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
# Autofilled metrics attributes.
|
| 168 |
+
self.detailed_autofilled_metrics = detailed_autofilled_metrics
|
| 169 |
+
self.last_report_time = time.time()
|
| 170 |
+
self.iteration = 0
|
| 171 |
+
self.time_total = 0.0
|
| 172 |
+
self.local_ip = self.get_current_ip()
|
| 173 |
+
|
| 174 |
+
self.accelerator = None
|
| 175 |
+
self._state = {}
|
| 176 |
+
|
| 177 |
+
def get_state(self, key: str) -> Any:
|
| 178 |
+
return self._state.get(key)
|
| 179 |
+
|
| 180 |
+
def set_state(self, key: str, value: Any):
|
| 181 |
+
self._state[key] = value
|
| 182 |
+
|
| 183 |
+
def get_current_ip(self):
|
| 184 |
+
self.local_ip = ray.util.get_node_ip_address()
|
| 185 |
+
return self.local_ip
|
| 186 |
+
|
| 187 |
+
def start(self):
|
| 188 |
+
"""Starts the training thread."""
|
| 189 |
+
self.training_started = True
|
| 190 |
+
self.training_thread.start()
|
| 191 |
+
|
| 192 |
+
def reset(
|
| 193 |
+
self,
|
| 194 |
+
training_func: Callable,
|
| 195 |
+
trial_info: TrialInfo,
|
| 196 |
+
storage: StorageContext,
|
| 197 |
+
loaded_checkpoint=None,
|
| 198 |
+
):
|
| 199 |
+
# This lock is used to control the execution of the training thread.
|
| 200 |
+
self.continue_lock = threading.Semaphore(0)
|
| 201 |
+
|
| 202 |
+
# This event is used to signal the training thread to stop.
|
| 203 |
+
self.stop_event = threading.Event()
|
| 204 |
+
|
| 205 |
+
# Queue for sending results across threads.
|
| 206 |
+
self.result_queue = queue.Queue(1)
|
| 207 |
+
|
| 208 |
+
# Queue for raising exceptions from runner thread to main thread.
|
| 209 |
+
# The error queue has a max size of one to prevent stacking error and force
|
| 210 |
+
# error reporting to block until finished.
|
| 211 |
+
self.error_queue = queue.Queue(1)
|
| 212 |
+
|
| 213 |
+
# The Thread object that is running the training function.
|
| 214 |
+
self.training_thread = RunnerThread(
|
| 215 |
+
target=training_func, daemon=True, error_queue=self.error_queue
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
# Possibly override with new state
|
| 219 |
+
self.trial_info = trial_info
|
| 220 |
+
self.storage = storage
|
| 221 |
+
self.loaded_checkpoint = loaded_checkpoint
|
| 222 |
+
|
| 223 |
+
# Reset state
|
| 224 |
+
self._state = {}
|
| 225 |
+
self.ignore_report = False
|
| 226 |
+
self.training_started = False
|
| 227 |
+
self._first_report = True
|
| 228 |
+
|
| 229 |
+
# Change the working directory to a special trial folder.
|
| 230 |
+
# This is to ensure that all Ray Train workers have a common working directory.
|
| 231 |
+
os.makedirs(storage.trial_working_directory, exist_ok=True)
|
| 232 |
+
if bool(int(os.environ.get(RAY_CHDIR_TO_TRIAL_DIR, "1"))):
|
| 233 |
+
logger.debug(
|
| 234 |
+
f"Changing the working directory to: {storage.trial_working_directory}"
|
| 235 |
+
)
|
| 236 |
+
os.chdir(storage.trial_working_directory)
|
| 237 |
+
|
| 238 |
+
def pause_reporting(self):
|
| 239 |
+
"""Ignore all future ``session.report()`` calls."""
|
| 240 |
+
self.ignore_report = True
|
| 241 |
+
|
| 242 |
+
def finish(self, timeout: Optional[float] = None) -> Optional[Any]:
|
| 243 |
+
"""Finishes the training thread.
|
| 244 |
+
|
| 245 |
+
Raises any Exception from training.
|
| 246 |
+
"""
|
| 247 |
+
# Set the stop event for the training thread to gracefully exit.
|
| 248 |
+
self.stop_event.set()
|
| 249 |
+
|
| 250 |
+
# Release the lock so that training thread can process this event.
|
| 251 |
+
self.continue_lock.release()
|
| 252 |
+
|
| 253 |
+
# Force a final (blocking) sync of artifacts in the trial path to storage.
|
| 254 |
+
self.storage.persist_artifacts(force=True)
|
| 255 |
+
|
| 256 |
+
# Wait for training to finish.
|
| 257 |
+
# This will raise any errors that occur during training, including SystemError
|
| 258 |
+
# This returns the result of the training function.
|
| 259 |
+
output = None
|
| 260 |
+
if self.training_started:
|
| 261 |
+
output = self.training_thread.join(timeout=timeout)
|
| 262 |
+
|
| 263 |
+
return output
|
| 264 |
+
|
| 265 |
+
def get_next(self) -> Optional[_TrainingResult]:
|
| 266 |
+
"""Gets the next ``_TrainingResult`` from the result queue.
|
| 267 |
+
|
| 268 |
+
If the result queue is empty, then this function returns ``None``.
|
| 269 |
+
"""
|
| 270 |
+
if not self.training_started:
|
| 271 |
+
raise RuntimeError("Please call start before calling get_next.")
|
| 272 |
+
|
| 273 |
+
if self.synchronous_result_reporting:
|
| 274 |
+
# There's no need to release the lock on the first report
|
| 275 |
+
# since `start` already started the training thread.
|
| 276 |
+
if not self._first_report:
|
| 277 |
+
# Release the lock to trigger training to continue,
|
| 278 |
+
# until the next call to report.
|
| 279 |
+
self.continue_lock.release()
|
| 280 |
+
self._first_report = False
|
| 281 |
+
|
| 282 |
+
result = None
|
| 283 |
+
# While training is still ongoing, attempt to get the result.
|
| 284 |
+
while result is None and self.training_thread.is_alive():
|
| 285 |
+
try:
|
| 286 |
+
result = self.result_queue.get(
|
| 287 |
+
block=True, timeout=_RESULT_FETCH_TIMEOUT
|
| 288 |
+
)
|
| 289 |
+
except queue.Empty:
|
| 290 |
+
pass
|
| 291 |
+
|
| 292 |
+
# If no result was found, then the runner must no longer be alive.
|
| 293 |
+
if result is None:
|
| 294 |
+
# Try one last time to fetch results in case results were
|
| 295 |
+
# reported in between the time of the last check and the
|
| 296 |
+
# termination of the thread runner.
|
| 297 |
+
try:
|
| 298 |
+
result = self.result_queue.get(
|
| 299 |
+
block=False, timeout=_RESULT_FETCH_TIMEOUT
|
| 300 |
+
)
|
| 301 |
+
except queue.Empty:
|
| 302 |
+
pass
|
| 303 |
+
|
| 304 |
+
# check if error occurred inside the thread runner.
|
| 305 |
+
if result is None:
|
| 306 |
+
# only raise an error from the runner if all results are consumed
|
| 307 |
+
self._report_thread_runner_error(block=True)
|
| 308 |
+
else:
|
| 309 |
+
if not self.error_queue.empty():
|
| 310 |
+
logger.debug(
|
| 311 |
+
(
|
| 312 |
+
"Runner error waiting to be raised in main thread. "
|
| 313 |
+
"Logging all available results first."
|
| 314 |
+
)
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
if not self.synchronous_result_reporting:
|
| 318 |
+
# At this point, the training thread has reached
|
| 319 |
+
# the `train.report` and is blocked there.
|
| 320 |
+
# If performing asynchronous result reporting,
|
| 321 |
+
# release the lock to allow each worker to keep training
|
| 322 |
+
# immediately after the coordinator fetches their result.
|
| 323 |
+
self.continue_lock.release()
|
| 324 |
+
|
| 325 |
+
# Return None if there are no more results to fetch.
|
| 326 |
+
return result
|
| 327 |
+
|
| 328 |
+
def _auto_fill_metrics(self, result: dict) -> dict:
|
| 329 |
+
"""Add autofilled metrics and update attributes."""
|
| 330 |
+
current_time = time.time()
|
| 331 |
+
current_datetime = datetime.now()
|
| 332 |
+
if TIME_THIS_ITER_S in result:
|
| 333 |
+
time_this_iter = result[TIME_THIS_ITER_S]
|
| 334 |
+
else:
|
| 335 |
+
time_this_iter = current_time - self.last_report_time
|
| 336 |
+
self.iteration += 1
|
| 337 |
+
self.time_total += time_this_iter
|
| 338 |
+
self.last_report_time = current_time
|
| 339 |
+
|
| 340 |
+
auto_filled_metrics = {
|
| 341 |
+
TIMESTAMP: int(time.mktime(current_datetime.timetuple())),
|
| 342 |
+
TIME_TOTAL_S: self.time_total,
|
| 343 |
+
WORKER_PID: os.getpid(),
|
| 344 |
+
WORKER_HOSTNAME: platform.node(),
|
| 345 |
+
WORKER_NODE_IP: self.local_ip,
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
if not self.detailed_autofilled_metrics:
|
| 349 |
+
auto_filled_metrics = {
|
| 350 |
+
k: v
|
| 351 |
+
for k, v in auto_filled_metrics.items()
|
| 352 |
+
if k not in DETAILED_AUTOFILLED_KEYS
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
result = result.copy()
|
| 356 |
+
result.update(auto_filled_metrics)
|
| 357 |
+
return result
|
| 358 |
+
|
| 359 |
+
def _auto_fill_checkpoint_metrics(self, result: dict) -> dict:
|
| 360 |
+
"""Add autofilled metrics and update attributes."""
|
| 361 |
+
current_datetime = datetime.now()
|
| 362 |
+
|
| 363 |
+
auto_filled_metrics = {
|
| 364 |
+
TIMESTAMP: int(time.mktime(current_datetime.timetuple()))
|
| 365 |
+
}
|
| 366 |
+
result = result.copy()
|
| 367 |
+
result.update(auto_filled_metrics)
|
| 368 |
+
return result
|
| 369 |
+
|
| 370 |
+
def _report_thread_runner_error(self, block=False):
|
| 371 |
+
try:
|
| 372 |
+
e = self.error_queue.get(block=block, timeout=_ERROR_FETCH_TIMEOUT)
|
| 373 |
+
raise StartTraceback from e
|
| 374 |
+
except queue.Empty:
|
| 375 |
+
pass
|
| 376 |
+
|
| 377 |
+
def _report_training_result(self, training_result: _TrainingResult) -> None:
|
| 378 |
+
"""Place a training result on the result queue for the main thread to process,
|
| 379 |
+
then block until the main thread signals that training should continue.
|
| 380 |
+
|
| 381 |
+
NOTE: This is used internally to report results from Train to Tune
|
| 382 |
+
without persisting checkpoints to storage 2 times.
|
| 383 |
+
`report` is the public API that directly persists to storage, which
|
| 384 |
+
should only be called by user code.
|
| 385 |
+
"""
|
| 386 |
+
if training_result.checkpoint:
|
| 387 |
+
# NOTE: This populates `train.get_checkpoint`
|
| 388 |
+
self.loaded_checkpoint = training_result.checkpoint
|
| 389 |
+
|
| 390 |
+
# Add result to a thread-safe queue.
|
| 391 |
+
self.result_queue.put(training_result, block=True)
|
| 392 |
+
|
| 393 |
+
# Acquire lock to stop the training thread until main thread
|
| 394 |
+
# triggers resume.
|
| 395 |
+
self.continue_lock.acquire()
|
| 396 |
+
|
| 397 |
+
# If the trial should be terminated, exit gracefully.
|
| 398 |
+
# NOTE: This is only really useful if `synchronous_result_reporting=True`.
|
| 399 |
+
# Otherwise, the lock is immediately released on reporting, and this
|
| 400 |
+
# check is skipped before the main thread decides to set the stop event.
|
| 401 |
+
if self.stop_event.is_set():
|
| 402 |
+
self.stop_event.clear()
|
| 403 |
+
sys.exit(0)
|
| 404 |
+
|
| 405 |
+
def report(self, metrics: Dict, checkpoint: Optional[Checkpoint] = None) -> None:
|
| 406 |
+
# Special case: early fail for Torch tensors
|
| 407 |
+
if "torch" in sys.modules:
|
| 408 |
+
from ray.air._internal.torch_utils import contains_tensor
|
| 409 |
+
|
| 410 |
+
if contains_tensor(metrics):
|
| 411 |
+
raise ValueError(
|
| 412 |
+
"Passing objects containg Torch tensors as metrics "
|
| 413 |
+
"is not supported as it will throw an exception on "
|
| 414 |
+
"deserialization. You can either convert the tensors "
|
| 415 |
+
"to Python objects or report a `train.Checkpoint` "
|
| 416 |
+
"with `ray.train.report` to store your Torch objects."
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
if self.ignore_report:
|
| 420 |
+
return
|
| 421 |
+
|
| 422 |
+
metrics = self._auto_fill_metrics(metrics)
|
| 423 |
+
|
| 424 |
+
persisted_checkpoint = None
|
| 425 |
+
if checkpoint:
|
| 426 |
+
self.storage._update_checkpoint_index(metrics)
|
| 427 |
+
|
| 428 |
+
# Persist the reported checkpoint files to storage.
|
| 429 |
+
persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint)
|
| 430 |
+
|
| 431 |
+
metrics[CHECKPOINT_DIR_NAME] = self.storage.checkpoint_dir_name
|
| 432 |
+
else:
|
| 433 |
+
metrics[CHECKPOINT_DIR_NAME] = None
|
| 434 |
+
|
| 435 |
+
# Persist trial artifacts to storage.
|
| 436 |
+
force_artifact_sync = (
|
| 437 |
+
persisted_checkpoint
|
| 438 |
+
and self.storage.sync_config.sync_artifacts_on_checkpoint
|
| 439 |
+
)
|
| 440 |
+
self.storage.persist_artifacts(force=force_artifact_sync)
|
| 441 |
+
|
| 442 |
+
# Set additional user metadata from the Trainer.
|
| 443 |
+
if persisted_checkpoint and self.metadata:
|
| 444 |
+
user_metadata = persisted_checkpoint.get_metadata()
|
| 445 |
+
for k, v in self.metadata.items():
|
| 446 |
+
# Update keys not already set by the user. This gives user-set keys
|
| 447 |
+
# precedence over keys set at the Trainer level.
|
| 448 |
+
if k not in user_metadata:
|
| 449 |
+
user_metadata[k] = v
|
| 450 |
+
persisted_checkpoint.set_metadata(user_metadata)
|
| 451 |
+
|
| 452 |
+
result = _TrainingResult(checkpoint=persisted_checkpoint, metrics=metrics)
|
| 453 |
+
|
| 454 |
+
self._report_training_result(result)
|
| 455 |
+
|
| 456 |
+
@property
|
| 457 |
+
def experiment_name(self) -> str:
|
| 458 |
+
return self.trial_info.experiment_name
|
| 459 |
+
|
| 460 |
+
@property
|
| 461 |
+
def trial_name(self) -> str:
|
| 462 |
+
return self.trial_info.name
|
| 463 |
+
|
| 464 |
+
@property
|
| 465 |
+
def trial_id(self) -> str:
|
| 466 |
+
return self.trial_info.id
|
| 467 |
+
|
| 468 |
+
@property
|
| 469 |
+
def run_id(self) -> str:
|
| 470 |
+
return self.trial_info.run_id
|
| 471 |
+
|
| 472 |
+
@property
|
| 473 |
+
def trial_resources(self) -> "PlacementGroupFactory":
|
| 474 |
+
return self.trial_info.resources
|
| 475 |
+
|
| 476 |
+
@property
|
| 477 |
+
def trial_dir(self) -> str:
|
| 478 |
+
return self.trial_info.logdir
|
| 479 |
+
|
| 480 |
+
def get_dataset_shard(
|
| 481 |
+
self,
|
| 482 |
+
dataset_name: Optional[str] = None,
|
| 483 |
+
) -> Optional["DataIterator"]:
|
| 484 |
+
shard = self.dataset_shard
|
| 485 |
+
if shard is None:
|
| 486 |
+
warnings.warn(
|
| 487 |
+
"No dataset passed in. Returning None. Make sure to "
|
| 488 |
+
"pass in a Dataset to Trainer.run to use this "
|
| 489 |
+
"function."
|
| 490 |
+
)
|
| 491 |
+
elif isinstance(shard, dict):
|
| 492 |
+
if not dataset_name:
|
| 493 |
+
raise RuntimeError(
|
| 494 |
+
"Multiple datasets were passed into ``Trainer``, "
|
| 495 |
+
"but no ``dataset_name`` is passed into "
|
| 496 |
+
"``get_dataset_shard``. Please specify which "
|
| 497 |
+
"dataset shard to retrieve."
|
| 498 |
+
)
|
| 499 |
+
return shard.get(dataset_name)
|
| 500 |
+
return shard
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
# Cache of resource dicts that have been checked by the launch hook already.
|
| 504 |
+
_checked_resources: Set[frozenset] = set()
|
| 505 |
+
|
| 506 |
+
# Global _TrainSession object initialized by Ray Tune function trainables
|
| 507 |
+
# and Ray Train V1 workers.
|
| 508 |
+
_session: Optional[_TrainSession] = None
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def _tune_task_and_actor_launch_hook(
|
| 512 |
+
fn, resources: Dict[str, float], strategy: Optional[SchedulingStrategyT]
|
| 513 |
+
):
|
| 514 |
+
"""Launch hook to catch nested tasks that can't fit in the placement group.
|
| 515 |
+
|
| 516 |
+
This gives users a nice warning in case they launch a nested task in a Tune trial
|
| 517 |
+
without reserving resources in the trial placement group to fit it.
|
| 518 |
+
"""
|
| 519 |
+
|
| 520 |
+
# Already checked, skip for performance reasons.
|
| 521 |
+
key = frozenset({(k, v) for k, v in resources.items() if v > 0})
|
| 522 |
+
if not key or key in _checked_resources:
|
| 523 |
+
return
|
| 524 |
+
|
| 525 |
+
# No need to check if placement group is None.
|
| 526 |
+
if (
|
| 527 |
+
not isinstance(strategy, PlacementGroupSchedulingStrategy)
|
| 528 |
+
or strategy.placement_group is None
|
| 529 |
+
):
|
| 530 |
+
return
|
| 531 |
+
|
| 532 |
+
# Check if the resource request is targeting the current placement group.
|
| 533 |
+
cur_pg = ray.util.get_current_placement_group()
|
| 534 |
+
if not cur_pg or strategy.placement_group.id != cur_pg.id:
|
| 535 |
+
return
|
| 536 |
+
|
| 537 |
+
_checked_resources.add(key)
|
| 538 |
+
|
| 539 |
+
# Check if the request can be fulfilled by the current placement group.
|
| 540 |
+
pgf = get_trial_resources()
|
| 541 |
+
|
| 542 |
+
if pgf.head_bundle_is_empty:
|
| 543 |
+
available_bundles = cur_pg.bundle_specs[0:]
|
| 544 |
+
else:
|
| 545 |
+
available_bundles = cur_pg.bundle_specs[1:]
|
| 546 |
+
|
| 547 |
+
# Check if the request can be fulfilled by the current placement group.
|
| 548 |
+
if _valid_resource_shape(resources, available_bundles):
|
| 549 |
+
return
|
| 550 |
+
|
| 551 |
+
if fn.class_name:
|
| 552 |
+
submitted = "actor"
|
| 553 |
+
name = fn.module_name + "." + fn.class_name + "." + fn.function_name
|
| 554 |
+
else:
|
| 555 |
+
submitted = "task"
|
| 556 |
+
name = fn.module_name + "." + fn.function_name
|
| 557 |
+
|
| 558 |
+
# Normalize the resource spec so it looks the same as the placement group bundle.
|
| 559 |
+
main_resources = cur_pg.bundle_specs[0]
|
| 560 |
+
resources = {k: float(v) for k, v in resources.items() if v > 0}
|
| 561 |
+
|
| 562 |
+
raise RuntimeError(
|
| 563 |
+
f"No trial resources are available for launching the {submitted} `{name}`. "
|
| 564 |
+
"To resolve this, specify the Tune option:\n\n"
|
| 565 |
+
"> resources_per_trial=tune.PlacementGroupFactory(\n"
|
| 566 |
+
f"> [{main_resources}] + [{resources}] * N\n"
|
| 567 |
+
"> )\n\n"
|
| 568 |
+
f"Where `N` is the number of slots to reserve for trial {submitted}s. "
|
| 569 |
+
"If you are using a Ray training library, there might be a utility function "
|
| 570 |
+
"to set this automatically for you. For more information, refer to "
|
| 571 |
+
"https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html"
|
| 572 |
+
)
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
def init_session(*args, **kwargs) -> None:
|
| 576 |
+
global _session
|
| 577 |
+
if _session:
|
| 578 |
+
raise ValueError(
|
| 579 |
+
"A Train session is already in use. Do not call "
|
| 580 |
+
"`init_session()` manually."
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# Setup hooks for generating placement group resource deadlock warnings.
|
| 584 |
+
from ray import actor, remote_function
|
| 585 |
+
|
| 586 |
+
if "TUNE_DISABLE_RESOURCE_CHECKS" not in os.environ:
|
| 587 |
+
actor._actor_launch_hook = _tune_task_and_actor_launch_hook
|
| 588 |
+
remote_function._task_launch_hook = _tune_task_and_actor_launch_hook
|
| 589 |
+
|
| 590 |
+
_session = _TrainSession(*args, **kwargs)
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
def get_session() -> Optional[_TrainSession]:
|
| 594 |
+
return _session
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
def shutdown_session():
|
| 598 |
+
"""Shuts down the initialized session."""
|
| 599 |
+
global _session
|
| 600 |
+
_session = None
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
def _raise_accelerator_session_misuse():
|
| 604 |
+
"""Raises a SessionMisuseError because a utility function was used improperly."""
|
| 605 |
+
raise SessionMisuseError(
|
| 606 |
+
"prepare/accelerate utility functions should be called inside a training "
|
| 607 |
+
"function executed by `Trainer.run`"
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
def get_accelerator(default_accelerator_cls: Type[Accelerator]) -> Accelerator:
|
| 612 |
+
"""The accelerator for this training session.
|
| 613 |
+
|
| 614 |
+
If an accelerator has not been set, then this method will construct an
|
| 615 |
+
accelerator using the provided accelerator class.
|
| 616 |
+
|
| 617 |
+
Raises:
|
| 618 |
+
SessionMisuseError: if the session is uninitialized.
|
| 619 |
+
"""
|
| 620 |
+
session = get_session()
|
| 621 |
+
if session is None:
|
| 622 |
+
_raise_accelerator_session_misuse()
|
| 623 |
+
if session.accelerator is None:
|
| 624 |
+
session.accelerator = default_accelerator_cls()
|
| 625 |
+
return session.accelerator
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
def set_accelerator(accelerator: Accelerator) -> None:
|
| 629 |
+
"""Sets the accelerator for this training session.
|
| 630 |
+
|
| 631 |
+
Args:
|
| 632 |
+
accelerator: The accelerator to use for training.
|
| 633 |
+
|
| 634 |
+
Raises:
|
| 635 |
+
SessionMisuseError: if the session is unitialized.
|
| 636 |
+
RuntimeError: if the accelerator has already been set.
|
| 637 |
+
"""
|
| 638 |
+
session = get_session()
|
| 639 |
+
if session is None:
|
| 640 |
+
_raise_accelerator_session_misuse()
|
| 641 |
+
if session.accelerator is not None:
|
| 642 |
+
raise RuntimeError("Cannot change accelerator once set.")
|
| 643 |
+
session.accelerator = accelerator
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
def _warn_session_misuse(default_value: Any = None):
|
| 647 |
+
"""Warns if fn is being used outside of session and returns ``default_value``."""
|
| 648 |
+
|
| 649 |
+
def inner(fn: Callable):
|
| 650 |
+
fn_name = fn.__name__
|
| 651 |
+
|
| 652 |
+
@functools.wraps(fn)
|
| 653 |
+
def wrapper(*args, **kwargs):
|
| 654 |
+
session = get_session()
|
| 655 |
+
if not session:
|
| 656 |
+
if log_once(f"{SESSION_MISUSE_LOG_ONCE_KEY}-{fn_name}"):
|
| 657 |
+
warnings.warn(
|
| 658 |
+
f"`{fn_name}` is meant to only be "
|
| 659 |
+
"called inside a function that is executed by a Tuner"
|
| 660 |
+
f" or Trainer. Returning `{default_value}`."
|
| 661 |
+
)
|
| 662 |
+
return default_value
|
| 663 |
+
return fn(*args, **kwargs)
|
| 664 |
+
|
| 665 |
+
return wrapper
|
| 666 |
+
|
| 667 |
+
return inner
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
@PublicAPI(stability="stable")
|
| 671 |
+
@_warn_session_misuse()
|
| 672 |
+
def report(metrics: Dict, *, checkpoint: Optional[Checkpoint] = None) -> None:
|
| 673 |
+
"""Report metrics and optionally save a checkpoint.
|
| 674 |
+
|
| 675 |
+
If a checkpoint is provided, it will be
|
| 676 |
+
:ref:`persisted to storage <persistent-storage-guide>`.
|
| 677 |
+
|
| 678 |
+
If this is called in multiple distributed training workers:
|
| 679 |
+
|
| 680 |
+
- Only the metrics reported by the rank 0 worker will be tracked by Ray Train.
|
| 681 |
+
See :ref:`the metrics logging guide <train-monitoring-and-logging>`.
|
| 682 |
+
- A checkpoint will be registered as long as one or more workers reports
|
| 683 |
+
checkpoint that is not None.
|
| 684 |
+
See the :ref:`checkpointing guide <train-dl-saving-checkpoints>`.
|
| 685 |
+
- Checkpoints from multiple workers will be merged into one directory
|
| 686 |
+
in persistent storage.
|
| 687 |
+
See :ref:`the distributed checkpointing guide <train-distributed-checkpointing>`.
|
| 688 |
+
|
| 689 |
+
.. note::
|
| 690 |
+
|
| 691 |
+
Each invocation of this method will automatically increment the underlying
|
| 692 |
+
``training_iteration`` number. The physical meaning of this "iteration" is
|
| 693 |
+
defined by user depending on how often they call ``report``.
|
| 694 |
+
It does not necessarily map to one epoch.
|
| 695 |
+
|
| 696 |
+
.. warning::
|
| 697 |
+
|
| 698 |
+
All workers must call `ray.train.report` the same number of times
|
| 699 |
+
so that Ray Train can properly synchronize the training state across
|
| 700 |
+
workers. Otherwise, your training will hang.
|
| 701 |
+
|
| 702 |
+
.. warning::
|
| 703 |
+
|
| 704 |
+
This method does NOT act as a barrier for distributed training workers.
|
| 705 |
+
Workers will upload their checkpoint, then continue training immediately.
|
| 706 |
+
If you need to synchronize workers, you can use a framework-native barrier
|
| 707 |
+
such as `torch.distributed.barrier()`.
|
| 708 |
+
|
| 709 |
+
Example:
|
| 710 |
+
|
| 711 |
+
.. testcode::
|
| 712 |
+
|
| 713 |
+
import tempfile
|
| 714 |
+
|
| 715 |
+
from ray import train
|
| 716 |
+
from ray.train import Checkpoint
|
| 717 |
+
from ray.train.torch import TorchTrainer
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
def train_func(config):
|
| 721 |
+
start_epoch = 0
|
| 722 |
+
checkpoint = train.get_checkpoint()
|
| 723 |
+
if checkpoint:
|
| 724 |
+
with checkpoint.as_directory() as checkpoint_dir:
|
| 725 |
+
# Load back training state
|
| 726 |
+
...
|
| 727 |
+
|
| 728 |
+
for epoch in range(start_epoch, config.get("num_epochs", 10)):
|
| 729 |
+
# Do training...
|
| 730 |
+
|
| 731 |
+
metrics = {"loss": ...}
|
| 732 |
+
|
| 733 |
+
with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
|
| 734 |
+
# Save the checkpoint...
|
| 735 |
+
# torch.save(...)
|
| 736 |
+
|
| 737 |
+
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
|
| 738 |
+
|
| 739 |
+
# Example: Only the rank 0 worker uploads the checkpoint.
|
| 740 |
+
if ray.train.get_context().get_world_rank() == 0:
|
| 741 |
+
train.report(metrics, checkpoint=checkpoint)
|
| 742 |
+
else:
|
| 743 |
+
train.report(metrics, checkpoint=None)
|
| 744 |
+
|
| 745 |
+
trainer = TorchTrainer(
|
| 746 |
+
train_func, scaling_config=train.ScalingConfig(num_workers=2)
|
| 747 |
+
)
|
| 748 |
+
|
| 749 |
+
Args:
|
| 750 |
+
metrics: The metrics you want to report.
|
| 751 |
+
checkpoint: The optional checkpoint you want to report.
|
| 752 |
+
"""
|
| 753 |
+
# If we are running in a Tune function, switch to `ray.tune.report`.
|
| 754 |
+
from ray.tune.trainable.trainable_fn_utils import _in_tune_session
|
| 755 |
+
|
| 756 |
+
if _in_tune_session():
|
| 757 |
+
import ray.tune
|
| 758 |
+
|
| 759 |
+
if _v2_migration_warnings_enabled():
|
| 760 |
+
_log_deprecation_warning(
|
| 761 |
+
"`ray.train.report` should be switched to "
|
| 762 |
+
"`ray.tune.report` when running in a function "
|
| 763 |
+
"passed to Ray Tune. This will be an error in the future."
|
| 764 |
+
)
|
| 765 |
+
return ray.tune.report(metrics, checkpoint=checkpoint)
|
| 766 |
+
|
| 767 |
+
get_session().report(metrics, checkpoint=checkpoint)
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
@PublicAPI(stability="stable")
|
| 771 |
+
@_warn_session_misuse()
|
| 772 |
+
def get_checkpoint() -> Optional[Checkpoint]:
|
| 773 |
+
"""Access the latest reported checkpoint to resume from if one exists.
|
| 774 |
+
|
| 775 |
+
Example:
|
| 776 |
+
|
| 777 |
+
.. testcode::
|
| 778 |
+
|
| 779 |
+
import tempfile
|
| 780 |
+
|
| 781 |
+
from ray import train
|
| 782 |
+
from ray.train import Checkpoint
|
| 783 |
+
from ray.train.torch import TorchTrainer
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
def train_func(config):
|
| 787 |
+
start_epoch = 0
|
| 788 |
+
checkpoint = train.get_checkpoint()
|
| 789 |
+
if checkpoint:
|
| 790 |
+
with checkpoint.as_directory() as checkpoint_dir:
|
| 791 |
+
# Load back training state
|
| 792 |
+
...
|
| 793 |
+
|
| 794 |
+
for epoch in range(start_epoch, config.get("num_epochs", 10)):
|
| 795 |
+
# Do training...
|
| 796 |
+
|
| 797 |
+
metrics = {"loss": ...}
|
| 798 |
+
|
| 799 |
+
with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
|
| 800 |
+
# Save the checkpoint...
|
| 801 |
+
|
| 802 |
+
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
|
| 803 |
+
train.report(metrics, checkpoint=checkpoint)
|
| 804 |
+
|
| 805 |
+
trainer = TorchTrainer(
|
| 806 |
+
train_func, scaling_config=train.ScalingConfig(num_workers=2)
|
| 807 |
+
)
|
| 808 |
+
|
| 809 |
+
Returns:
|
| 810 |
+
Checkpoint object if the session is currently being resumed.
|
| 811 |
+
Otherwise, return None.
|
| 812 |
+
"""
|
| 813 |
+
# If we are running in a Tune function, switch to `ray.tune.get_checkpoint`.
|
| 814 |
+
from ray.tune.trainable.trainable_fn_utils import _in_tune_session
|
| 815 |
+
|
| 816 |
+
if _in_tune_session():
|
| 817 |
+
import ray.tune
|
| 818 |
+
|
| 819 |
+
if _v2_migration_warnings_enabled():
|
| 820 |
+
_log_deprecation_warning(
|
| 821 |
+
"`ray.train.get_checkpoint` should be switched to "
|
| 822 |
+
"`ray.tune.get_checkpoint` when running in a function "
|
| 823 |
+
"passed to Ray Tune. This will be an error in the future."
|
| 824 |
+
)
|
| 825 |
+
return ray.tune.get_checkpoint()
|
| 826 |
+
|
| 827 |
+
return get_session().loaded_checkpoint
|
| 828 |
+
|
| 829 |
+
|
| 830 |
+
@PublicAPI(stability="beta")
|
| 831 |
+
@_warn_session_misuse()
|
| 832 |
+
def get_metadata() -> Dict[str, Any]:
|
| 833 |
+
"""User metadata dict passed to the Trainer constructor."""
|
| 834 |
+
return get_session().metadata
|
| 835 |
+
|
| 836 |
+
|
| 837 |
+
@PublicAPI(stability="beta")
|
| 838 |
+
@_warn_session_misuse()
|
| 839 |
+
def get_experiment_name() -> str:
|
| 840 |
+
"""Experiment name for the corresponding trial."""
|
| 841 |
+
return get_session().experiment_name
|
| 842 |
+
|
| 843 |
+
|
| 844 |
+
@PublicAPI(stability="beta")
|
| 845 |
+
@_warn_session_misuse()
|
| 846 |
+
def get_trial_name() -> str:
|
| 847 |
+
"""Trial name for the corresponding trial."""
|
| 848 |
+
return get_session().trial_name
|
| 849 |
+
|
| 850 |
+
|
| 851 |
+
@PublicAPI(stability="beta")
|
| 852 |
+
@_warn_session_misuse()
+def get_trial_id() -> str:
+    """Trial id for the corresponding trial."""
+    return get_session().trial_id
+
+
+@PublicAPI(stability="alpha")
+@_warn_session_misuse()
+def get_run_id() -> str:
+    """Unique Train Run id for the corresponding trial."""
+    return get_session().run_id
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse()
+def get_trial_resources() -> "PlacementGroupFactory":
+    """Trial resources for the corresponding trial."""
+    return get_session().trial_resources
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse()
+def get_trial_dir() -> str:
+    """Log directory corresponding to the trial directory for a Tune session.
+    If calling from a Train session, this will give the trial directory of its parent
+    Tune session.
+
+    .. testcode::
+
+        from ray import train, tune
+
+        def train_func(config):
+            print(train.get_context().get_trial_dir())
+
+        tuner = tune.Tuner(train_func)
+        tuner.fit()
+
+    .. testoutput::
+        :options: +MOCK
+
+        /Users/root/ray_results/train_func_2023-07-19_15-01-37/train_func_d620c_00000_0_2023-07-19_15-01-40
+    """
+    return get_session().trial_dir
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse(default_value=1)
+def get_world_size() -> int:
+    """Get the current world size (i.e. total number of workers) for this run.
+
+    .. testcode::
+
+        import ray
+        from ray import train
+        from ray.train import ScalingConfig
+        from ray.train.tensorflow import TensorflowTrainer
+
+        NUM_WORKERS = 2
+
+        def train_loop_per_worker(config):
+            assert train.get_context().get_world_size() == NUM_WORKERS
+
+        train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+        trainer = TensorflowTrainer(
+            train_loop_per_worker,
+            scaling_config=ScalingConfig(num_workers=NUM_WORKERS),
+            datasets={"train": train_dataset}
+        )
+        trainer.fit()
+
+    .. testoutput::
+        :hide:
+
+        ...
+    """
+    session = get_session()
+    if not hasattr(session, "world_size"):
+        raise RuntimeError(
+            "`get_world_size` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.world_size
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse(default_value=0)
+def get_world_rank() -> int:
+    """Get the world rank of this worker.
+
+    .. testcode::
+
+        import ray
+        from ray import train
+        from ray.train import ScalingConfig
+        from ray.train.tensorflow import TensorflowTrainer
+
+        def train_loop_per_worker(config):
+            if train.get_context().get_world_rank() == 0:
+                print("Worker 0")
+
+        train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+        trainer = TensorflowTrainer(
+            train_loop_per_worker,
+            scaling_config=ScalingConfig(num_workers=2),
+            datasets={"train": train_dataset}
+        )
+        trainer.fit()
+
+    .. testoutput::
+        :hide:
+
+        ...
+    """
+    session = get_session()
+    if not hasattr(session, "world_rank"):
+        raise RuntimeError(
+            "`get_world_rank` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.world_rank
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse(default_value=0)
+def get_local_rank() -> int:
+    """Get the local rank of this worker (rank of the worker on its node).
+
+    .. testcode::
+
+        import torch
+
+        import ray
+        from ray import train
+        from ray.train import ScalingConfig
+        from ray.train.torch import TorchTrainer
+
+        def train_loop_per_worker(config):
+            if torch.cuda.is_available():
+                torch.cuda.set_device(train.get_context().get_local_rank())
+            ...
+
+        train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+        trainer = TorchTrainer(
+            train_loop_per_worker,
+            scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
+            datasets={"train": train_dataset}
+        )
+        trainer.fit()
+
+    .. testoutput::
+        :hide:
+
+        ...
+    """
+    session = get_session()
+    if not hasattr(session, "local_rank"):
+        raise RuntimeError(
+            "`get_local_rank` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.local_rank
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse(default_value=0)
+def get_local_world_size() -> int:
+    """Get the local world size of this node (i.e. number of workers on this node).
+
+    Example:
+
+        .. testcode::
+
+            import ray
+            from ray import train
+            from ray.train import ScalingConfig
+            from ray.train.torch import TorchTrainer
+
+            def train_loop_per_worker():
+                print(train.get_context().get_local_world_size())
+
+            train_dataset = ray.data.from_items(
+                [{"x": x, "y": x + 1} for x in range(32)])
+            trainer = TorchTrainer(train_loop_per_worker,
+                scaling_config=ScalingConfig(num_workers=1),
+                datasets={"train": train_dataset})
+            trainer.fit()
+
+        .. testoutput::
+            :hide:
+
+            ...
+    """
+    session = get_session()
+    if not hasattr(session, "local_world_size"):
+        raise RuntimeError(
+            "`get_local_world_size` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.local_world_size
+
+
+@PublicAPI(stability="beta")
+@_warn_session_misuse(default_value=0)
+def get_node_rank() -> int:
+    """Get the rank of this node.
+
+    Example:
+
+        .. testcode::
+
+            import ray
+            from ray import train
+            from ray.train import ScalingConfig
+            from ray.train.torch import TorchTrainer
+
+            def train_loop_per_worker():
+                print(train.get_context().get_node_rank())
+
+            train_dataset = ray.data.from_items(
+                [{"x": x, "y": x + 1} for x in range(32)])
+            trainer = TorchTrainer(train_loop_per_worker,
+                scaling_config=ScalingConfig(num_workers=1),
+                datasets={"train": train_dataset})
+            trainer.fit()
+
+        .. testoutput::
+            :hide:
+
+            ...
+    """
+    session = get_session()
+    if not hasattr(session, "node_rank"):
+        raise RuntimeError(
+            "`get_node_rank` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.node_rank
+
+
+@PublicAPI(stability="stable")
+@_warn_session_misuse()
+def get_dataset_shard(
+    dataset_name: Optional[str] = None,
+) -> Optional["DataIterator"]:
+    """Returns the :class:`ray.data.DataIterator` shard for this worker.
+
+    Call :meth:`~ray.data.DataIterator.iter_torch_batches` or
+    :meth:`~ray.data.DataIterator.to_tf` on this shard to convert it to the
+    appropriate framework-specific data type.
+
+    .. testcode::
+
+        import ray
+        from ray import train
+        from ray.train import ScalingConfig
+        from ray.train.torch import TorchTrainer
+
+        def train_loop_per_worker(config):
+            ...
+            for epoch in range(2):
+                # Trainer will automatically handle sharding.
+                data_shard = train.get_dataset_shard("train")
+                for batch in data_shard.iter_torch_batches():
+                    ...
+
+        train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+        trainer = TorchTrainer(
+            train_loop_per_worker,
+            scaling_config=ScalingConfig(num_workers=2),
+            datasets={"train": train_dataset}
+        )
+        trainer.fit()
+
+    .. testoutput::
+        :hide:
+
+        ...
+
+    Args:
+        dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then
+            specifies which dataset shard to return.
+
+    Returns:
+        The ``DataIterator`` shard to use for this worker.
+        If no dataset is passed into Trainer, then return None.
+    """
+    session = get_session()
+    if not hasattr(session, "get_dataset_shard"):
+        raise RuntimeError(
+            "`get_dataset_shard` can only be called for TrainSession! "
+            "Make sure you only use that in `train_loop_per_worker` function "
+            "that is passed into `DataParallelTrainer`."
+        )
+    return session.get_dataset_shard(dataset_name)
+
+
+@DeveloperAPI
+@_warn_session_misuse()
+def get_storage() -> StorageContext:
+    """Returns the :class:`~ray.train._internal.storage.StorageContext` storage
+    context which gives advanced access to the filesystem and paths
+    configured through `RunConfig`.
+
+    NOTE: This is a developer API, and the `StorageContext` interface may change
+    without notice between minor versions.
+    """
+    return get_session().storage
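The helpers above are thin wrappers over the per-worker session; user code normally reaches them through ray.train.get_context() rather than importing this internal module. A minimal sketch of combining the rank/size getters inside a training loop, assuming ray[train] and torch are installed; the worker count and print statements are illustrative only:

import ray
from ray import train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    # Each getter below delegates to the active TrainSession,
    # mirroring the module-level functions in this file.
    ctx = train.get_context()
    print(
        f"world rank {ctx.get_world_rank()} of {ctx.get_world_size()}, "
        f"local rank {ctx.get_local_rank()} of {ctx.get_local_world_size()}, "
        f"node rank {ctx.get_node_rank()}"
    )
    if ctx.get_world_rank() == 0:
        # Rank 0 commonly handles logging and artifact writing.
        print(f"trial dir: {ctx.get_trial_dir()}")


if __name__ == "__main__":
    ray.init()
    trainer = TorchTrainer(
        train_loop_per_worker,
        scaling_config=ScalingConfig(num_workers=2),
    )
    trainer.fit()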
.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py
ADDED
@@ -0,0 +1,14 @@
+from ray.train._internal.state.state_manager import TrainRunStateManager
+
+try:
+    import pydantic  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "pydantic isn't installed. "
+        "To install pydantic, please run 'pip install pydantic'"
+    )
+
+
+__all__ = [
+    "TrainRunStateManager",
+]
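The guard above makes a missing pydantic fail at import time with an actionable message. A small sketch, assuming nothing beyond the module shown here, of how a caller could degrade gracefully instead of crashing (ModuleNotFoundError is a subclass of ImportError, so catching the parent class covers both the guard and a plain missing-module error):

# Sketch: tolerate the optional state-tracking dependency being absent.
try:
    from ray.train._internal.state import TrainRunStateManager
except ImportError as exc:
    TrainRunStateManager = None
    print(f"Train run state tracking unavailable: {exc}")

if TrainRunStateManager is None:
    # Fall back to running without dashboard run tracking.
    pass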
.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (581 Bytes).
.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc
ADDED
Binary file (8.55 kB).
.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc
ADDED
Binary file (3.44 kB).
.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc
ADDED
Binary file (6.71 kB).
.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py
ADDED
@@ -0,0 +1,158 @@
+from enum import Enum
+from typing import List, Optional
+
+from ray._private.pydantic_compat import BaseModel, Field
+from ray.dashboard.modules.job.pydantic_models import JobDetails
+from ray.util.annotations import DeveloperAPI
+
+MAX_ERROR_STACK_TRACE_LENGTH = 50000
+
+
+@DeveloperAPI
+class RunStatusEnum(str, Enum):
+    """Enumeration for the status of a train run."""
+
+    # (Deprecated) Replaced by RUNNING.
+    # The train run has started
+    STARTED = "STARTED"
+    # The train run is running
+    RUNNING = "RUNNING"
+    # The train run was terminated as expected
+    FINISHED = "FINISHED"
+    # The train run was terminated early due to errors in the training function
+    ERRORED = "ERRORED"
+    # The train run was terminated early due to system errors or controller errors
+    ABORTED = "ABORTED"
+
+
+@DeveloperAPI
+class ActorStatusEnum(str, Enum):
+    DEAD = "DEAD"
+    ALIVE = "ALIVE"
+
+
+@DeveloperAPI
+class TrainWorkerInfo(BaseModel):
+    """Metadata of a Ray Train worker."""
+
+    actor_id: str = Field(description="Actor ID of the worker.")
+    world_rank: int = Field(description="World rank of the worker.")
+    local_rank: int = Field(description="Local rank of the worker.")
+    node_rank: int = Field(description="Node rank of the worker.")
+    node_id: str = Field(description="ID of the node that the worker is running on.")
+    node_ip: str = Field(
+        description="IP address of the node that the worker is running on."
+    )
+    pid: int = Field(description="Process ID of the worker.")
+    gpu_ids: List[int] = Field(
+        description="A list of GPU ids allocated to that worker."
+    )
+    status: Optional[ActorStatusEnum] = Field(
+        description="The status of the train worker actor. It can be ALIVE or DEAD."
+    )
+
+
+@DeveloperAPI
+class MemoryInfo(BaseModel):
+    rss: int
+    vms: int
+    pfaults: Optional[int]
+    pageins: Optional[int]
+
+
+@DeveloperAPI
+class ProcessStats(BaseModel):
+    cpuPercent: float
+    # total memory, free memory, memory used ratio
+    mem: Optional[List[int]]
+    memoryInfo: MemoryInfo
+
+
+class ProcessGPUUsage(BaseModel):
+    # This gpu usage stats from a process
+    pid: int
+    gpuMemoryUsage: int
+
+
+@DeveloperAPI
+class GPUStats(BaseModel):
+    uuid: str
+    index: int
+    name: str
+    utilizationGpu: Optional[float]
+    memoryUsed: float
+    memoryTotal: float
+    processInfo: ProcessGPUUsage
+
+
+@DeveloperAPI
+class TrainWorkerInfoWithDetails(TrainWorkerInfo):
+    """Metadata of a Ray Train worker."""
+
+    processStats: Optional[ProcessStats] = Field(
+        None, description="Process stats of the worker."
+    )
+    gpus: List[GPUStats] = Field(
+        default_factory=list,
+        description=(
+            "GPU stats of the worker. "
+            "Only returns GPUs that are attached to the worker process."
+        ),
+    )
+
+
+@DeveloperAPI
+class TrainDatasetInfo(BaseModel):
+    name: str = Field(
+        description="The key of the dataset dict specified in Ray Train Trainer."
+    )
+    dataset_uuid: str = Field(description="The uuid of the dataset.")
+    dataset_name: Optional[str] = Field(description="The name of the dataset.")
+
+
+@DeveloperAPI
+class TrainRunInfo(BaseModel):
+    """Metadata for a Ray Train run and information about its workers."""
+
+    name: str = Field(description="The name of the Train run.")
+    id: str = Field(description="The unique identifier for each Train run.")
+    job_id: str = Field(description="The Ray Job ID.")
+    controller_actor_id: str = Field(description="Actor Id of the Train controller.")
+    workers: List[TrainWorkerInfo] = Field(
+        description="A List of Train workers sorted by global ranks."
+    )
+    datasets: List[TrainDatasetInfo] = Field(
+        description="A List of dataset info for this Train run."
+    )
+    run_status: RunStatusEnum = Field(
+        description="The current status of the train run. It can be one of the "
+        "following: RUNNING, FINISHED, ERRORED, or ABORTED."
+    )
+    status_detail: str = Field(
+        description="Detailed information about the current run status, "
+        "such as error messages."
+    )
+    start_time_ms: int = Field(
+        description="The UNIX timestamp of the start time of this Train run."
+    )
+    end_time_ms: Optional[int] = Field(
+        description="The UNIX timestamp of the end time of this Train run. "
+        "If null, the Train run has not ended yet."
+    )
+
+
+@DeveloperAPI
+class TrainRunInfoWithDetails(TrainRunInfo):
+    """Metadata for a Ray Train run and information about its workers."""
+
+    workers: List[TrainWorkerInfoWithDetails] = Field(
+        description="A List of Train workers sorted by global ranks."
+    )
+    job_details: Optional[JobDetails] = Field(
+        None, description="Details of the job that started this Train run."
+    )
+
+
+@DeveloperAPI
+class TrainRunsResponse(BaseModel):
+    train_runs: List[TrainRunInfoWithDetails]
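For illustration, a sketch of how these models nest; every ID and timestamp below is a made-up placeholder, and the field names follow the definitions above:

from ray.train._internal.state.schema import (
    RunStatusEnum,
    TrainDatasetInfo,
    TrainRunInfo,
    TrainWorkerInfo,
)

# Placeholder values -- real IDs come from Ray core and the Train controller.
worker = TrainWorkerInfo(
    actor_id="abc123",
    world_rank=0,
    local_rank=0,
    node_rank=0,
    node_id="node-0",
    node_ip="10.0.0.1",
    pid=4242,
    gpu_ids=[0],
    status=None,
)

run = TrainRunInfo(
    name="demo_run",
    id="run-0001",
    job_id="02000000",
    controller_actor_id="ctrl456",
    workers=[worker],
    datasets=[
        TrainDatasetInfo(name="train", dataset_uuid="ds-uuid", dataset_name=None)
    ],
    run_status=RunStatusEnum.RUNNING,
    status_detail="",
    start_time_ms=1_700_000_000_000,
    end_time_ms=None,
)

print(run)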
.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py
ADDED
@@ -0,0 +1,62 @@
+import logging
+import threading
+from typing import Dict, Optional
+
+import ray
+from ray.actor import ActorHandle
+from ray.train._internal.state.schema import TrainRunInfo
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote(num_cpus=0)
+class TrainStateActor:
+    def __init__(self):
+        self._run_infos: Dict[str, TrainRunInfo] = {}
+
+    def register_train_run(self, run_info: TrainRunInfo) -> None:
+        # Register a new train run.
+        self._run_infos[run_info.id] = run_info
+
+    def get_train_run(self, run_id: str) -> Optional[TrainRunInfo]:
+        # Retrieve a registered run by its id
+        return self._run_infos.get(run_id, None)
+
+    def get_all_train_runs(self) -> Dict[str, TrainRunInfo]:
+        # Retrieve all registered train runs
+        return self._run_infos
+
+
+TRAIN_STATE_ACTOR_NAME = "train_state_actor"
+TRAIN_STATE_ACTOR_NAMESPACE = "_train_state_actor"
+
+_state_actor_lock: threading.RLock = threading.RLock()
+
+
+def get_or_create_state_actor() -> ActorHandle:
+    """Get or create a `TrainStateActor` on the head node."""
+    with _state_actor_lock:
+        state_actor = TrainStateActor.options(
+            name=TRAIN_STATE_ACTOR_NAME,
+            namespace=TRAIN_STATE_ACTOR_NAMESPACE,
+            get_if_exists=True,
+            lifetime="detached",
+            resources={"node:__internal_head__": 0.001},
+            # Escape from the parent's placement group
+            scheduling_strategy="DEFAULT",
+        ).remote()
+
+    # Ensure the state actor is ready
+    ray.get(state_actor.__ray_ready__.remote())
+    return state_actor
+
+
+def get_state_actor() -> Optional[ActorHandle]:
+    """Get the `TrainStateActor` if it exists, otherwise return None."""
+    try:
+        return ray.get_actor(
+            name=TRAIN_STATE_ACTOR_NAME,
+            namespace=TRAIN_STATE_ACTOR_NAMESPACE,
+        )
+    except ValueError:
+        return None
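A driver-side sketch (not part of the library) showing how the two lookup helpers are intended to be combined; it assumes a running Ray cluster and nothing else:

import ray

from ray.train._internal.state.state_actor import (
    get_or_create_state_actor,
    get_state_actor,
)

ray.init()

# First call creates the detached, head-node-pinned actor; later calls
# return the same handle because of `get_if_exists=True`.
state_actor = get_or_create_state_actor()

# Other processes (e.g. the dashboard) can look it up by name and namespace.
handle = get_state_actor()
assert handle is not None

# Registered runs come back as a Dict[str, TrainRunInfo].
runs = ray.get(handle.get_all_train_runs.remote())
print(f"{len(runs)} train run(s) currently registered")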
.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py
ADDED
@@ -0,0 +1,126 @@
+import logging
+import os
+from collections import defaultdict
+from typing import Any, Dict
+
+import ray
+from ray.data import Dataset
+from ray.train._internal.state.schema import (
+    RunStatusEnum,
+    TrainDatasetInfo,
+    TrainRunInfo,
+    TrainWorkerInfo,
+)
+from ray.train._internal.utils import check_for_failure
+from ray.train._internal.worker_group import WorkerGroup
+
+logger = logging.getLogger(__name__)
+
+
+class TrainRunStateManager:
+    """A class that aggregates and reports train run info to TrainStateActor.
+
+    This manager class is created on the train controller layer for each run.
+    """
+
+    def __init__(self, state_actor) -> None:
+        self.state_actor = state_actor
+        self.train_run_info_dict = defaultdict(dict)
+
+    def register_train_run(
+        self,
+        run_id: str,
+        job_id: str,
+        run_name: str,
+        run_status: str,
+        controller_actor_id: str,
+        datasets: Dict[str, Dataset],
+        worker_group: WorkerGroup,
+        start_time_ms: float,
+        status_detail: str = "",
+    ) -> None:
+        """Collect Train Run Info and report to StateActor."""
+
+        if not self.state_actor:
+            logger.warning(
+                "Unable to register train run since `TrainStateActor` is not started."
+            )
+            return
+
+        def collect_train_worker_info():
+            train_context = ray.train.get_context()
+            core_context = ray.runtime_context.get_runtime_context()
+
+            return TrainWorkerInfo(
+                world_rank=train_context.get_world_rank(),
+                local_rank=train_context.get_local_rank(),
+                node_rank=train_context.get_node_rank(),
+                actor_id=core_context.get_actor_id(),
+                node_id=core_context.get_node_id(),
+                node_ip=ray.util.get_node_ip_address(),
+                gpu_ids=ray.get_gpu_ids(),
+                pid=os.getpid(),
+            )
+
+        futures = [
+            worker_group.execute_single_async(index, collect_train_worker_info)
+            for index in range(len(worker_group))
+        ]
+        success, exception = check_for_failure(futures)
+
+        if not success:
+            logger.error(
+                "Failed to collect run information from the Ray Train "
+                f"workers:\n{exception}"
+            )
+            return
+
+        worker_info_list = ray.get(futures)
+        worker_info_list = sorted(worker_info_list, key=lambda info: info.world_rank)
+
+        dataset_info_list = [
+            TrainDatasetInfo(
+                name=ds_name,
+                dataset_name=ds._plan._dataset_name,
+                dataset_uuid=ds._plan._dataset_uuid,
+            )
+            for ds_name, ds in datasets.items()
+        ]
+
+        updates = dict(
+            id=run_id,
+            job_id=job_id,
+            name=run_name,
+            controller_actor_id=controller_actor_id,
+            workers=worker_info_list,
+            datasets=dataset_info_list,
+            start_time_ms=start_time_ms,
+            run_status=run_status,
+            status_detail=status_detail,
+        )
+
+        # Clear the cached info to avoid registering the same run twice
+        self.train_run_info_dict[run_id] = {}
+        self._update_train_run_info(run_id, updates)
+
+    def end_train_run(
+        self,
+        run_id: str,
+        run_status: RunStatusEnum,
+        status_detail: str,
+        end_time_ms: int,
+    ):
+        """Update the train run status when the training is finished."""
+        updates = dict(
+            run_status=run_status,
+            status_detail=status_detail,
+            end_time_ms=end_time_ms,
+        )
+        self._update_train_run_info(run_id, updates)
+
+    def _update_train_run_info(self, run_id: str, updates: Dict[str, Any]) -> None:
+        """Update specific fields of a registered TrainRunInfo instance."""
+        if run_id in self.train_run_info_dict:
+            self.train_run_info_dict[run_id].update(updates)
+            train_run_info = TrainRunInfo(**self.train_run_info_dict[run_id])
+            ray.get(self.state_actor.register_train_run.remote(train_run_info))
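The manager is normally driven by the Train controller, which owns the worker group. The sketch below only illustrates the call order and argument meanings; `worker_group` and `datasets` are assumed to come from that surrounding controller code, and the IDs are placeholders:

import time

from ray.train._internal.state.schema import RunStatusEnum
from ray.train._internal.state.state_actor import get_or_create_state_actor
from ray.train._internal.state.state_manager import TrainRunStateManager


def report_run_lifecycle(worker_group, datasets):
    # Assumption: worker_group is a started WorkerGroup and datasets is the
    # Dict[str, Dataset] passed to the Trainer; both are created elsewhere.
    manager = TrainRunStateManager(state_actor=get_or_create_state_actor())

    manager.register_train_run(
        run_id="run-0001",  # placeholder
        job_id="02000000",  # placeholder
        run_name="demo_run",
        run_status=RunStatusEnum.RUNNING,
        controller_actor_id="ctrl456",  # placeholder
        datasets=datasets,
        worker_group=worker_group,
        start_time_ms=int(time.time() * 1000),
    )

    # ... training runs to completion ...

    manager.end_train_run(
        run_id="run-0001",
        run_status=RunStatusEnum.FINISHED,
        status_detail="",
        end_time_ms=int(time.time() * 1000),
    )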
.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py
ADDED
@@ -0,0 +1,725 @@
| 1 |
+
# Try import ray[train] core requirements (defined in setup.py)
|
| 2 |
+
# isort: off
|
| 3 |
+
try:
|
| 4 |
+
import fsspec # noqa
|
| 5 |
+
from fsspec.implementations.local import LocalFileSystem
|
| 6 |
+
|
| 7 |
+
except (ImportError, ModuleNotFoundError) as e:
|
| 8 |
+
raise RuntimeError(
|
| 9 |
+
"fsspec is a required dependency of Ray Train and Ray Tune. "
|
| 10 |
+
"Please install with: `pip install fsspec`"
|
| 11 |
+
) from e
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import pyarrow
|
| 15 |
+
import pyarrow.fs
|
| 16 |
+
|
| 17 |
+
except (ImportError, ModuleNotFoundError) as e:
|
| 18 |
+
raise RuntimeError(
|
| 19 |
+
"pyarrow is a required dependency of Ray Train and Ray Tune. "
|
| 20 |
+
"Please install with: `pip install pyarrow`"
|
| 21 |
+
) from e
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
# check if Arrow has S3 support
|
| 25 |
+
from pyarrow.fs import S3FileSystem
|
| 26 |
+
except ImportError:
|
| 27 |
+
S3FileSystem = None
|
| 28 |
+
# isort: on
|
| 29 |
+
|
| 30 |
+
import fnmatch
|
| 31 |
+
import logging
|
| 32 |
+
import os
|
| 33 |
+
import shutil
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 36 |
+
|
| 37 |
+
from ray.air._internal.filelock import TempFileLock
|
| 38 |
+
from ray.train._internal.syncer import SyncConfig, Syncer, _BackgroundSyncer
|
| 39 |
+
from ray.train.constants import _get_ray_train_session_dir
|
| 40 |
+
from ray.util.annotations import DeveloperAPI
|
| 41 |
+
|
| 42 |
+
if TYPE_CHECKING:
|
| 43 |
+
from ray.train._checkpoint import Checkpoint
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
_VALIDATE_STORAGE_MARKER_FILENAME = ".validate_storage_marker"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class _ExcludingLocalFilesystem(LocalFileSystem):
|
| 53 |
+
"""LocalFileSystem wrapper to exclude files according to patterns.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
root_path: Root path to strip when matching with the exclude pattern.
|
| 57 |
+
Ex: root_path="/tmp/a/b/c", exclude=["*a*"], will exclude
|
| 58 |
+
/tmp/a/b/c/_a_.txt but not ALL of /tmp/a/*.
|
| 59 |
+
exclude: List of patterns that are applied to files returned by
|
| 60 |
+
``self.find()``. If a file path matches this pattern, it will
|
| 61 |
+
be excluded.
|
| 62 |
+
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
def __init__(self, root_path: Path, exclude: List[str], **kwargs):
|
| 66 |
+
super().__init__(**kwargs)
|
| 67 |
+
self._exclude = exclude
|
| 68 |
+
self._root_path = root_path
|
| 69 |
+
|
| 70 |
+
@property
|
| 71 |
+
def fsid(self):
|
| 72 |
+
return "_excluding_local"
|
| 73 |
+
|
| 74 |
+
def _should_exclude(self, path: str) -> bool:
|
| 75 |
+
"""Return True if `path` (relative to `root_path`) matches any of the
|
| 76 |
+
`self._exclude` patterns."""
|
| 77 |
+
path = Path(path)
|
| 78 |
+
relative_path = path.relative_to(self._root_path).as_posix()
|
| 79 |
+
match_candidates = [relative_path]
|
| 80 |
+
if path.is_dir():
|
| 81 |
+
# Everything is in posix path format ('/')
|
| 82 |
+
match_candidates.append(relative_path + "/")
|
| 83 |
+
|
| 84 |
+
for excl in self._exclude:
|
| 85 |
+
if any(fnmatch.fnmatch(candidate, excl) for candidate in match_candidates):
|
| 86 |
+
return True
|
| 87 |
+
return False
|
| 88 |
+
|
| 89 |
+
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
| 90 |
+
"""Call parent find() and exclude from result."""
|
| 91 |
+
paths = super().find(
|
| 92 |
+
path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
|
| 93 |
+
)
|
| 94 |
+
if detail:
|
| 95 |
+
return {
|
| 96 |
+
path: out
|
| 97 |
+
for path, out in paths.items()
|
| 98 |
+
if not self._should_exclude(path)
|
| 99 |
+
}
|
| 100 |
+
else:
|
| 101 |
+
return [path for path in paths if not self._should_exclude(path)]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _pyarrow_fs_copy_files(
|
| 105 |
+
source, destination, source_filesystem=None, destination_filesystem=None, **kwargs
|
| 106 |
+
):
|
| 107 |
+
if S3FileSystem and isinstance(destination_filesystem, pyarrow.fs.S3FileSystem):
|
| 108 |
+
# Workaround multi-threading issue with pyarrow. Note that use_threads=True
|
| 109 |
+
# is safe for download, just not for uploads, see:
|
| 110 |
+
# https://github.com/apache/arrow/issues/32372
|
| 111 |
+
kwargs.setdefault("use_threads", False)
|
| 112 |
+
|
| 113 |
+
# Use a large chunk size to speed up large checkpoint transfers.
|
| 114 |
+
kwargs.setdefault("chunk_size", 64 * 1024 * 1024)
|
| 115 |
+
|
| 116 |
+
return pyarrow.fs.copy_files(
|
| 117 |
+
source,
|
| 118 |
+
destination,
|
| 119 |
+
source_filesystem=source_filesystem,
|
| 120 |
+
destination_filesystem=destination_filesystem,
|
| 121 |
+
**kwargs,
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# TODO(justinvyu): Add unit tests for all these utils.
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str):
|
| 129 |
+
is_dir = _is_directory(fs, fs_path)
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
if is_dir:
|
| 133 |
+
fs.delete_dir(fs_path)
|
| 134 |
+
else:
|
| 135 |
+
fs.delete_file(fs_path)
|
| 136 |
+
except Exception:
|
| 137 |
+
logger.exception(f"Caught exception when deleting path at ({fs}, {fs_path}):")
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _download_from_fs_path(
|
| 141 |
+
fs: pyarrow.fs.FileSystem,
|
| 142 |
+
fs_path: str,
|
| 143 |
+
local_path: str,
|
| 144 |
+
filelock: bool = True,
|
| 145 |
+
):
|
| 146 |
+
"""Downloads a directory or file from (fs, fs_path) to a local path.
|
| 147 |
+
|
| 148 |
+
If fs_path points to a directory:
|
| 149 |
+
- The full directory contents are downloaded directly into `local_path`,
|
| 150 |
+
rather than to a subdirectory of `local_path`.
|
| 151 |
+
|
| 152 |
+
If fs_path points to a file:
|
| 153 |
+
- The file is downloaded to `local_path`, which is expected to be a file path.
|
| 154 |
+
|
| 155 |
+
If the download fails, the `local_path` contents are
|
| 156 |
+
cleaned up before raising, if the directory did not previously exist.
|
| 157 |
+
|
| 158 |
+
NOTE: This method creates `local_path`'s parent directories if they do not
|
| 159 |
+
already exist. If the download fails, this does NOT clean up all the parent
|
| 160 |
+
directories that were created.
|
| 161 |
+
|
| 162 |
+
Args:
|
| 163 |
+
fs: The filesystem to download from.
|
| 164 |
+
fs_path: The filesystem path (either a directory or a file) to download.
|
| 165 |
+
local_path: The local path to download to.
|
| 166 |
+
filelock: Whether to require a file lock before downloading, useful for
|
| 167 |
+
multiple downloads to the same directory that may be happening in parallel.
|
| 168 |
+
|
| 169 |
+
Raises:
|
| 170 |
+
FileNotFoundError: if (fs, fs_path) doesn't exist.
|
| 171 |
+
"""
|
| 172 |
+
|
| 173 |
+
_local_path = Path(local_path).resolve()
|
| 174 |
+
exists_before = _local_path.exists()
|
| 175 |
+
if _is_directory(fs=fs, fs_path=fs_path):
|
| 176 |
+
_local_path.mkdir(parents=True, exist_ok=True)
|
| 177 |
+
else:
|
| 178 |
+
_local_path.parent.mkdir(parents=True, exist_ok=True)
|
| 179 |
+
|
| 180 |
+
try:
|
| 181 |
+
if filelock:
|
| 182 |
+
with TempFileLock(f"{os.path.normpath(local_path)}.lock"):
|
| 183 |
+
_pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
|
| 184 |
+
else:
|
| 185 |
+
_pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
|
| 186 |
+
except Exception as e:
|
| 187 |
+
# Clean up the directory if downloading was unsuccessful
|
| 188 |
+
if not exists_before:
|
| 189 |
+
shutil.rmtree(local_path, ignore_errors=True)
|
| 190 |
+
raise e
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def _upload_to_fs_path(
|
| 194 |
+
local_path: str,
|
| 195 |
+
fs: pyarrow.fs.FileSystem,
|
| 196 |
+
fs_path: str,
|
| 197 |
+
exclude: Optional[List[str]] = None,
|
| 198 |
+
) -> None:
|
| 199 |
+
"""Uploads a local directory or file to (fs, fs_path).
|
| 200 |
+
|
| 201 |
+
NOTE: This will create all necessary parent directories at the destination.
|
| 202 |
+
|
| 203 |
+
Args:
|
| 204 |
+
local_path: The local path to upload.
|
| 205 |
+
fs: The filesystem to upload to.
|
| 206 |
+
fs_path: The filesystem path where the dir/file will be uploaded to.
|
| 207 |
+
exclude: A list of filename matches to exclude from upload. This includes
|
| 208 |
+
all files under subdirectories as well.
|
| 209 |
+
This pattern will match with the relative paths of all files under
|
| 210 |
+
`local_path`.
|
| 211 |
+
Ex: ["*.png"] to exclude all .png images.
|
| 212 |
+
"""
|
| 213 |
+
|
| 214 |
+
if not exclude:
|
| 215 |
+
# TODO(justinvyu): uploading a single file doesn't work
|
| 216 |
+
# (since we always create a directory at fs_path)
|
| 217 |
+
_create_directory(fs=fs, fs_path=fs_path)
|
| 218 |
+
_pyarrow_fs_copy_files(local_path, fs_path, destination_filesystem=fs)
|
| 219 |
+
return
|
| 220 |
+
|
| 221 |
+
_upload_to_uri_with_exclude_fsspec(
|
| 222 |
+
local_path=local_path, fs=fs, fs_path=fs_path, exclude=exclude
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _upload_to_uri_with_exclude_fsspec(
|
| 227 |
+
local_path: str, fs: "pyarrow.fs", fs_path: str, exclude: Optional[List[str]]
|
| 228 |
+
) -> None:
|
| 229 |
+
local_fs = _ExcludingLocalFilesystem(root_path=local_path, exclude=exclude)
|
| 230 |
+
handler = pyarrow.fs.FSSpecHandler(local_fs)
|
| 231 |
+
source_fs = pyarrow.fs.PyFileSystem(handler)
|
| 232 |
+
|
| 233 |
+
_create_directory(fs=fs, fs_path=fs_path)
|
| 234 |
+
_pyarrow_fs_copy_files(
|
| 235 |
+
local_path, fs_path, source_filesystem=source_fs, destination_filesystem=fs
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _list_at_fs_path(
|
| 240 |
+
fs: pyarrow.fs.FileSystem,
|
| 241 |
+
fs_path: str,
|
| 242 |
+
file_filter: Optional[Callable[[pyarrow.fs.FileInfo], bool]] = None,
|
| 243 |
+
) -> List[str]:
|
| 244 |
+
"""Returns the list of filenames at (fs, fs_path), similar to os.listdir.
|
| 245 |
+
|
| 246 |
+
If the path doesn't exist, returns an empty list.
|
| 247 |
+
"""
|
| 248 |
+
if file_filter is None:
|
| 249 |
+
file_filter = lambda x: True # noqa: E731
|
| 250 |
+
|
| 251 |
+
selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False)
|
| 252 |
+
return [
|
| 253 |
+
os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/"))
|
| 254 |
+
for file_info in fs.get_file_info(selector)
|
| 255 |
+
if file_filter(file_info)
|
| 256 |
+
]
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool:
|
| 260 |
+
"""Returns True if (fs, fs_path) exists."""
|
| 261 |
+
|
| 262 |
+
valid = fs.get_file_info(fs_path)
|
| 263 |
+
return valid.type != pyarrow.fs.FileType.NotFound
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def _is_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool:
|
| 267 |
+
"""Checks if (fs, fs_path) is a directory or a file.
|
| 268 |
+
|
| 269 |
+
Raises:
|
| 270 |
+
FileNotFoundError: if (fs, fs_path) doesn't exist.
|
| 271 |
+
"""
|
| 272 |
+
|
| 273 |
+
file_info = fs.get_file_info(fs_path)
|
| 274 |
+
if file_info.type == pyarrow.fs.FileType.NotFound:
|
| 275 |
+
raise FileNotFoundError(f"Path not found: ({fs}, {fs_path})")
|
| 276 |
+
|
| 277 |
+
return not file_info.is_file
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def _create_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> None:
|
| 281 |
+
"""Create directory at (fs, fs_path).
|
| 282 |
+
|
| 283 |
+
Some external filesystems require directories to already exist, or at least
|
| 284 |
+
the `netloc` to be created (e.g. PyArrows ``mock://`` filesystem).
|
| 285 |
+
|
| 286 |
+
Generally this should be done before and outside of Ray applications. This
|
| 287 |
+
utility is thus primarily used in testing, e.g. of ``mock://` URIs.
|
| 288 |
+
"""
|
| 289 |
+
try:
|
| 290 |
+
fs.create_dir(fs_path)
|
| 291 |
+
except Exception:
|
| 292 |
+
logger.exception(
|
| 293 |
+
f"Caught exception when creating directory at ({fs}, {fs_path}):"
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def get_fs_and_path(
|
| 298 |
+
storage_path: Union[str, os.PathLike],
|
| 299 |
+
storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
| 300 |
+
) -> Tuple[pyarrow.fs.FileSystem, str]:
|
| 301 |
+
"""Returns the fs and path from a storage path and an optional custom fs.
|
| 302 |
+
|
| 303 |
+
Args:
|
| 304 |
+
storage_path: A storage path or URI. (ex: s3://bucket/path or /tmp/ray_results)
|
| 305 |
+
storage_filesystem: A custom filesystem to use. If not provided,
|
| 306 |
+
this will be auto-resolved by pyarrow. If provided, the storage_path
|
| 307 |
+
is assumed to be prefix-stripped already, and must be a valid path
|
| 308 |
+
on the filesystem.
|
| 309 |
+
"""
|
| 310 |
+
storage_path = str(storage_path)
|
| 311 |
+
|
| 312 |
+
if storage_filesystem:
|
| 313 |
+
return storage_filesystem, storage_path
|
| 314 |
+
|
| 315 |
+
return pyarrow.fs.FileSystem.from_uri(storage_path)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class _FilesystemSyncer(_BackgroundSyncer):
|
| 319 |
+
"""Syncer between local filesystem and a `storage_filesystem`."""
|
| 320 |
+
|
| 321 |
+
def __init__(self, storage_filesystem: Optional["pyarrow.fs.FileSystem"], **kwargs):
|
| 322 |
+
self.storage_filesystem = storage_filesystem
|
| 323 |
+
super().__init__(**kwargs)
|
| 324 |
+
|
| 325 |
+
def _sync_up_command(
|
| 326 |
+
self, local_path: str, uri: str, exclude: Optional[List] = None
|
| 327 |
+
) -> Tuple[Callable, Dict]:
|
| 328 |
+
# TODO(justinvyu): Defer this cleanup up as part of the
|
| 329 |
+
# external-facing Syncer deprecation.
|
| 330 |
+
fs_path = uri
|
| 331 |
+
return (
|
| 332 |
+
_upload_to_fs_path,
|
| 333 |
+
dict(
|
| 334 |
+
local_path=local_path,
|
| 335 |
+
fs=self.storage_filesystem,
|
| 336 |
+
fs_path=fs_path,
|
| 337 |
+
exclude=exclude,
|
| 338 |
+
),
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]:
|
| 342 |
+
fs_path = uri
|
| 343 |
+
return (
|
| 344 |
+
_download_from_fs_path,
|
| 345 |
+
dict(
|
| 346 |
+
fs=self.storage_filesystem,
|
| 347 |
+
fs_path=fs_path,
|
| 348 |
+
local_path=local_path,
|
| 349 |
+
),
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
+
def _delete_command(self, uri: str) -> Tuple[Callable, Dict]:
|
| 353 |
+
fs_path = uri
|
| 354 |
+
return _delete_fs_path, dict(fs=self.storage_filesystem, fs_path=fs_path)
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
@DeveloperAPI
|
| 358 |
+
class StorageContext:
|
| 359 |
+
"""Shared context that holds the source of truth for all paths and
|
| 360 |
+
storage utilities, passed along from the driver to workers.
|
| 361 |
+
|
| 362 |
+
This object defines a few types of paths:
|
| 363 |
+
1. *_fs_path: A path on the `storage_filesystem`. This is a regular path
|
| 364 |
+
which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and
|
| 365 |
+
can be joined with `Path(...).as_posix()`.
|
| 366 |
+
2. *_driver_staging_path: The temporary staging directory on the local filesystem
|
| 367 |
+
where driver artifacts are saved to before persisting them to storage.
|
| 368 |
+
3. trial_working_directory: The local filesystem path that the remote
|
| 369 |
+
actors' working directories are moved to by default.
|
| 370 |
+
This is separated from the driver staging path so that driver syncing
|
| 371 |
+
does not implicitly upload the trial working directory, for trials on the
|
| 372 |
+
driver node.
|
| 373 |
+
|
| 374 |
+
Example with storage_path="mock:///bucket/path?param=1":
|
| 375 |
+
|
| 376 |
+
>>> import ray
|
| 377 |
+
>>> from ray.train._internal.storage import StorageContext
|
| 378 |
+
>>> import os
|
| 379 |
+
>>> _ = ray.init()
|
| 380 |
+
>>> storage = StorageContext(
|
| 381 |
+
... storage_path="mock://netloc/bucket/path?param=1",
|
| 382 |
+
... experiment_dir_name="exp_name",
|
| 383 |
+
... )
|
| 384 |
+
>>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS
|
| 385 |
+
<pyarrow._fs._MockFileSystem object...
|
| 386 |
+
>>> storage.experiment_fs_path
|
| 387 |
+
'bucket/path/exp_name'
|
| 388 |
+
>>> storage.experiment_driver_staging_path # doctest: +ELLIPSIS
|
| 389 |
+
'/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts'
|
| 390 |
+
>>> storage.trial_dir_name = "trial_dir"
|
| 391 |
+
>>> storage.trial_fs_path
|
| 392 |
+
'bucket/path/exp_name/trial_dir'
|
| 393 |
+
>>> storage.trial_driver_staging_path # doctest: +ELLIPSIS
|
| 394 |
+
'/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts/trial_dir'
|
| 395 |
+
>>> storage.trial_working_directory # doctest: +ELLIPSIS
|
| 396 |
+
'/tmp/ray/session_.../artifacts/.../exp_name/working_dirs/trial_dir'
|
| 397 |
+
>>> storage.current_checkpoint_index = 1
|
| 398 |
+
>>> storage.checkpoint_fs_path
|
| 399 |
+
'bucket/path/exp_name/trial_dir/checkpoint_000001'
|
| 400 |
+
>>> ray.shutdown()
|
| 401 |
+
|
| 402 |
+
Example with storage_path="/tmp/ray_results":
|
| 403 |
+
|
| 404 |
+
>>> from ray.train._internal.storage import StorageContext
|
| 405 |
+
>>> storage = StorageContext(
|
| 406 |
+
... storage_path="/tmp/ray_results",
|
| 407 |
+
... experiment_dir_name="exp_name",
|
| 408 |
+
... )
|
| 409 |
+
>>> storage.storage_fs_path
|
| 410 |
+
'/tmp/ray_results'
|
| 411 |
+
>>> storage.experiment_fs_path
|
| 412 |
+
'/tmp/ray_results/exp_name'
|
| 413 |
+
>>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS
|
| 414 |
+
<pyarrow._fs.LocalFileSystem object...
|
| 415 |
+
|
| 416 |
+
Internal Usage Examples:
|
| 417 |
+
- To copy files to the trial directory on the storage filesystem:
|
| 418 |
+
|
| 419 |
+
pyarrow.fs.copy_files(
|
| 420 |
+
local_dir,
|
| 421 |
+
Path(storage.trial_fs_path, "subdir").as_posix(),
|
| 422 |
+
destination_filesystem=storage.filesystem
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
.. warning::
|
| 426 |
+
This is an experimental developer API and is subject to change
|
| 427 |
+
without notice between versions.
|
| 428 |
+
"""
|
| 429 |
+
|
| 430 |
+
def __init__(
|
| 431 |
+
self,
|
| 432 |
+
storage_path: Union[str, os.PathLike],
|
| 433 |
+
experiment_dir_name: str,
|
| 434 |
+
sync_config: Optional[SyncConfig] = None,
|
| 435 |
+
storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
| 436 |
+
trial_dir_name: Optional[str] = None,
|
| 437 |
+
current_checkpoint_index: int = -1,
|
| 438 |
+
):
|
| 439 |
+
from ray.tune.utils import date_str
|
| 440 |
+
|
| 441 |
+
self.custom_fs_provided = storage_filesystem is not None
|
| 442 |
+
|
| 443 |
+
# Invariant: (`storage_filesystem`, `storage_path`) is the location where
|
| 444 |
+
# *all* results can be accessed.
|
| 445 |
+
self.experiment_dir_name = experiment_dir_name
|
| 446 |
+
self.trial_dir_name = trial_dir_name
|
| 447 |
+
self.current_checkpoint_index = current_checkpoint_index
|
| 448 |
+
self.sync_config = sync_config or SyncConfig()
|
| 449 |
+
|
| 450 |
+
self.storage_filesystem, self.storage_fs_path = get_fs_and_path(
|
| 451 |
+
storage_path, storage_filesystem
|
| 452 |
+
)
|
| 453 |
+
self.storage_fs_path = Path(self.storage_fs_path).as_posix()
|
| 454 |
+
|
| 455 |
+
self.syncer: Syncer = _FilesystemSyncer(
|
| 456 |
+
storage_filesystem=self.storage_filesystem,
|
| 457 |
+
sync_period=self.sync_config.sync_period,
|
| 458 |
+
sync_timeout=self.sync_config.sync_timeout,
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
self._create_validation_file()
|
| 462 |
+
self._check_validation_file()
|
| 463 |
+
|
| 464 |
+
# Timestamp is used to create a unique session directory for the current
|
| 465 |
+
# training job. This is used to avoid conflicts when multiple training jobs
|
| 466 |
+
# run with the same name in the same cluster.
|
| 467 |
+
# This is set ONCE at the creation of the storage context, on the driver.
|
| 468 |
+
self._timestamp = date_str()
|
| 469 |
+
|
| 470 |
+
def __str__(self):
|
| 471 |
+
return (
|
| 472 |
+
"StorageContext<\n"
|
| 473 |
+
f" storage_filesystem='{self.storage_filesystem.type_name}',\n"
|
| 474 |
+
f" storage_fs_path='{self.storage_fs_path}',\n"
|
| 475 |
+
f" experiment_dir_name='{self.experiment_dir_name}',\n"
|
| 476 |
+
f" trial_dir_name='{self.trial_dir_name}',\n"
|
| 477 |
+
f" current_checkpoint_index={self.current_checkpoint_index},\n"
|
| 478 |
+
">"
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
def _create_validation_file(self):
|
| 482 |
+
"""On the creation of a storage context, create a validation file at the
|
| 483 |
+
storage path to verify that the storage path can be written to.
|
| 484 |
+
This validation file is also used to check whether the storage path is
|
| 485 |
+
accessible by all nodes in the cluster."""
|
| 486 |
+
valid_file = Path(
|
| 487 |
+
self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME
|
| 488 |
+
).as_posix()
|
| 489 |
+
self.storage_filesystem.create_dir(self.experiment_fs_path)
|
| 490 |
+
with self.storage_filesystem.open_output_stream(valid_file):
|
| 491 |
+
pass
|
| 492 |
+
|
| 493 |
+
def _check_validation_file(self):
|
| 494 |
+
"""Checks that the validation file exists at the storage path."""
|
| 495 |
+
valid_file = Path(
|
| 496 |
+
self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME
|
| 497 |
+
).as_posix()
|
| 498 |
+
if not _exists_at_fs_path(fs=self.storage_filesystem, fs_path=valid_file):
|
| 499 |
+
raise RuntimeError(
|
| 500 |
+
f"Unable to set up cluster storage with the following settings:\n{self}"
|
| 501 |
+
"\nCheck that all nodes in the cluster have read/write access "
|
| 502 |
+
"to the configured storage path. `RunConfig(storage_path)` should be "
|
| 503 |
+
"set to a cloud storage URI or a shared filesystem path accessible "
|
| 504 |
+
"by all nodes in your cluster ('s3://bucket' or '/mnt/nfs'). "
|
| 505 |
+
"A local path on the head node is not accessible by worker nodes. "
|
| 506 |
+
"See: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html" # noqa: E501
|
| 507 |
+
)
|
| 508 |
+
|
| 509 |
+
def _update_checkpoint_index(self, metrics: Dict):
|
| 510 |
+
# Per default, increase by 1. This can be overwritten to customize checkpoint
|
| 511 |
+
# directories.
|
| 512 |
+
self.current_checkpoint_index += 1
|
| 513 |
+
|
| 514 |
+
def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint":
|
| 515 |
+
"""Persists a given checkpoint to the current checkpoint path on the filesystem.
|
| 516 |
+
|
| 517 |
+
"Current" is defined by the `current_checkpoint_index` attribute of the
|
| 518 |
+
storage context.
|
| 519 |
+
|
| 520 |
+
This method copies the checkpoint files to the storage location.
|
| 521 |
+
It's up to the user to delete the original checkpoint files if desired.
|
| 522 |
+
|
| 523 |
+
For example, the original directory is typically a local temp directory.
|
| 524 |
+
|
| 525 |
+
Args:
|
| 526 |
+
checkpoint: The checkpoint to persist to (fs, checkpoint_fs_path).
|
| 527 |
+
|
| 528 |
+
Returns:
|
| 529 |
+
Checkpoint: A Checkpoint pointing to the persisted checkpoint location.
|
| 530 |
+
"""
|
| 531 |
+
# TODO(justinvyu): Fix this cyclical import.
|
| 532 |
+
from ray.train._checkpoint import Checkpoint
|
| 533 |
+
|
| 534 |
+
logger.debug(
|
| 535 |
+
"Copying checkpoint files to storage path:\n"
|
| 536 |
+
"({source_fs}, {source}) -> ({dest_fs}, {destination})".format(
|
| 537 |
+
source=checkpoint.path,
|
| 538 |
+
destination=self.checkpoint_fs_path,
|
| 539 |
+
source_fs=checkpoint.filesystem,
|
| 540 |
+
dest_fs=self.storage_filesystem,
|
| 541 |
+
)
|
| 542 |
+
)
|
| 543 |
+
|
| 544 |
+
# Raise an error if the storage path is not accessible when
|
| 545 |
+
# attempting to upload a checkpoint from a remote worker.
|
| 546 |
+
# Ex: If storage_path is a local path, then a validation marker
|
| 547 |
+
# will only exist on the head node but not the worker nodes.
|
| 548 |
+
self._check_validation_file()
|
| 549 |
+
|
| 550 |
+
self.storage_filesystem.create_dir(self.checkpoint_fs_path)
|
| 551 |
+
_pyarrow_fs_copy_files(
|
| 552 |
+
source=checkpoint.path,
|
| 553 |
+
destination=self.checkpoint_fs_path,
|
| 554 |
+
source_filesystem=checkpoint.filesystem,
|
| 555 |
+
destination_filesystem=self.storage_filesystem,
|
| 556 |
+
)
|
| 557 |
+
|
| 558 |
+
persisted_checkpoint = Checkpoint(
|
| 559 |
+
filesystem=self.storage_filesystem,
|
| 560 |
+
path=self.checkpoint_fs_path,
|
| 561 |
+
)
|
| 562 |
+
logger.info(f"Checkpoint successfully created at: {persisted_checkpoint}")
|
| 563 |
+
        return persisted_checkpoint

    def persist_artifacts(self, force: bool = False) -> None:
        """Persists all artifacts within `trial_local_dir` to storage.

        This method possibly launches a background task to sync the trial dir,
        depending on the `sync_period` + `sync_artifacts_on_checkpoint`
        settings of `SyncConfig`.

        `(local_fs, trial_working_dir) -> (storage_filesystem, trial_fs_path)`

        Args:
            force: If True, wait for a previous sync to finish, launch a new one,
                and wait for that one to finish. By the end of a `force=True` call, the
                latest version of the trial artifacts will be persisted.
        """
        if not self.sync_config.sync_artifacts:
            return

        # Skip if there are no artifacts to sync
        is_empty = not any(os.scandir(self.trial_working_directory))
        if is_empty:
            return

        if force:
            self.syncer.wait()
            self.syncer.sync_up(
                local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path
            )
            self.syncer.wait()
        else:
            self.syncer.sync_up_if_needed(
                local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path
            )

    @property
    def experiment_fs_path(self) -> str:
        """The path on the `storage_filesystem` to the experiment directory.

        NOTE: This does not have a URI prefix anymore, since it has been stripped
        by pyarrow.fs.FileSystem.from_uri already. The URI scheme information is
        kept in `storage_filesystem` instead.
        """
        return Path(self.storage_fs_path, self.experiment_dir_name).as_posix()

    def _get_session_path(self) -> str:
        """The Ray Train/Tune session local directory used to stage files
        before persisting to the storage filesystem."""
        return Path(
            _get_ray_train_session_dir(), self._timestamp, self.experiment_dir_name
        ).as_posix()

    @property
    def experiment_driver_staging_path(self) -> str:
        """The local filesystem path of the experiment directory on the driver node.

        The driver is the node where `Trainer.fit`/`Tuner.fit` is being called.

        This path is of the form:
        `/tmp/ray/session_<session_id>/artifacts/<ray-train-job-timestamp>/
        <experiment_dir_name>/driver_artifacts`

        This should be used as the temporary staging location for files *on the driver*
        before syncing them to `experiment_fs_path`.
        For example, the search algorithm should dump its state to this directory.
        See `trial_driver_staging_path` for writing trial-specific artifacts.

        The directory is synced to
        `{storage_path}/{experiment_dir_name}` periodically.
        See `_ExperimentCheckpointManager.checkpoint` for where that happens.
        """
        return Path(self._get_session_path(), "driver_artifacts").as_posix()

    @property
    def trial_fs_path(self) -> str:
        """The trial directory path on the `storage_filesystem`.

        Raises a ValueError if `trial_dir_name` is not set beforehand.
        """
        if self.trial_dir_name is None:
            raise RuntimeError(
                "Should not access `trial_fs_path` without setting `trial_dir_name`"
            )
        return Path(self.experiment_fs_path, self.trial_dir_name).as_posix()

    @property
    def trial_driver_staging_path(self) -> str:
        """The local filesystem path of the trial directory on the driver.

        The driver is the node where `Trainer.fit`/`Tuner.fit` is being called.

        This path is of the form:
        `/tmp/ray/session_<session_id>/artifacts/<ray-train-job-timestamp>/
        <experiment_dir_name>/driver_artifacts/<trial_dir_name>`

        This should be used as the temporary location for files on the driver
        before persisting them to `trial_fs_path`.

        For example, callbacks (e.g., JsonLoggerCallback) should write trial-specific
        logfiles within this directory.
        """
        if self.trial_dir_name is None:
            raise RuntimeError(
                "Should not access `trial_driver_staging_path` "
                "without setting `trial_dir_name`"
            )
        return Path(self.experiment_driver_staging_path, self.trial_dir_name).as_posix()

    @property
    def trial_working_directory(self) -> str:
        """The local filesystem path to trial working directory.

        This path is of the form:
        `/tmp/ray/session_<session_id>/artifacts/<ray-train-job-timestamp>/
        <experiment_dir_name>/working_dirs/<trial_dir_name>`

        Ray Train/Tune moves the remote actor's working directory to this path
        by default, unless disabled by `RAY_CHDIR_TO_TRIAL_DIR` environment variable.

        Writing files to this directory allows users to persist training artifacts
        if `SyncConfig(sync_artifacts=True)` is set.
        """
        if self.trial_dir_name is None:
            raise RuntimeError(
                "Cannot access `trial_working_directory` without "
                "setting `trial_dir_name`"
            )
        return Path(
            self._get_session_path(), "working_dirs", self.trial_dir_name
        ).as_posix()

    @property
    def checkpoint_fs_path(self) -> str:
        """The current checkpoint directory path on the `storage_filesystem`.

        "Current" refers to the checkpoint that is currently being created/persisted.
        The user of this class is responsible for setting the `current_checkpoint_index`
        (e.g., incrementing when needed).
        """
        return Path(self.trial_fs_path, self.checkpoint_dir_name).as_posix()

    @property
    def checkpoint_dir_name(self) -> str:
        """The current checkpoint directory name, based on the checkpoint index."""
        return StorageContext._make_checkpoint_dir_name(self.current_checkpoint_index)

    @staticmethod
    def get_experiment_dir_name(run_obj: Union[str, Callable, Type]) -> str:
        from ray.tune.experiment import Experiment
        from ray.tune.utils import date_str

        run_identifier = Experiment.get_trainable_name(run_obj)

        if bool(int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0))):
            dir_name = run_identifier
        else:
            dir_name = "{}_{}".format(run_identifier, date_str())
        return dir_name

    @staticmethod
    def _make_checkpoint_dir_name(index: int):
        """Get the name of the checkpoint directory, given an index."""
        return f"checkpoint_{index:06d}"

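A minimal sketch (not part of the vendored file above) of how the `experiment_fs_path`, `trial_fs_path`, and `checkpoint_fs_path` properties compose; the bucket, experiment, and trial names here are hypothetical:

from pathlib import Path

def make_checkpoint_dir_name(index: int) -> str:
    # Mirrors StorageContext._make_checkpoint_dir_name: zero-padded to six digits.
    return f"checkpoint_{index:06d}"

# Hypothetical values standing in for StorageContext attributes.
storage_fs_path = "my-bucket/ray-results"
experiment_dir_name = "TorchTrainer_2024-01-01_00-00-00"
trial_dir_name = "TorchTrainer_abc12_00000"

experiment_fs_path = Path(storage_fs_path, experiment_dir_name).as_posix()
trial_fs_path = Path(experiment_fs_path, trial_dir_name).as_posix()
checkpoint_fs_path = Path(trial_fs_path, make_checkpoint_dir_name(3)).as_posix()

print(checkpoint_fs_path)
# my-bucket/ray-results/TorchTrainer_2024-01-01_00-00-00/TorchTrainer_abc12_00000/checkpoint_000003
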
.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py
ADDED
@@ -0,0 +1,490 @@
import abc
import logging
import threading
import time
import traceback
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ray._private.thirdparty.tabulate.tabulate import tabulate
from ray.train.constants import _DEPRECATED_VALUE
from ray.util.annotations import DeveloperAPI, PublicAPI
from ray.widgets import Template

logger = logging.getLogger(__name__)

# Syncing period for syncing checkpoints between nodes or to cloud.
DEFAULT_SYNC_PERIOD = 300

# Default sync timeout after which syncing processes are aborted
DEFAULT_SYNC_TIMEOUT = 1800


@PublicAPI(stability="stable")
@dataclass
class SyncConfig:
    """Configuration object for Train/Tune file syncing to `RunConfig(storage_path)`.

    In Ray Train/Tune, here is where syncing (mainly uploading) happens:

    The experiment driver (on the head node) syncs the experiment directory to storage
    (which includes experiment state such as searcher state, the list of trials
    and their statuses, and trial metadata).

    It's also possible to sync artifacts from the trial directory to storage
    by setting `sync_artifacts=True`.
    For a Ray Tune run with many trials, each trial will upload its trial directory
    to storage, which includes arbitrary files that you dumped during the run.
    For a Ray Train run doing distributed training, each remote worker will similarly
    upload its trial directory to storage.

    See :ref:`persistent-storage-guide` for more details and examples.

    Args:
        sync_period: Minimum time in seconds to wait between two sync operations.
            A smaller ``sync_period`` will have the data in storage updated more often
            but introduces more syncing overhead. Defaults to 5 minutes.
        sync_timeout: Maximum time in seconds to wait for a sync process
            to finish running. A sync operation will run for at most this long
            before raising a `TimeoutError`. Defaults to 30 minutes.
        sync_artifacts: [Beta] Whether or not to sync artifacts that are saved to the
            trial directory (accessed via `train.get_context().get_trial_dir()`)
            to the persistent storage configured via `train.RunConfig(storage_path)`.
            The trial or remote worker will try to launch an artifact syncing
            operation every time `train.report` happens, subject to `sync_period`
            and `sync_artifacts_on_checkpoint`.
            Defaults to False -- no artifacts are persisted by default.
        sync_artifacts_on_checkpoint: If True, trial/worker artifacts are
            forcefully synced on every reported checkpoint.
            This only has an effect if `sync_artifacts` is True.
            Defaults to True.
    """

    sync_period: int = DEFAULT_SYNC_PERIOD
    sync_timeout: int = DEFAULT_SYNC_TIMEOUT
    sync_artifacts: bool = False
    sync_artifacts_on_checkpoint: bool = True
    upload_dir: Optional[str] = _DEPRECATED_VALUE
    syncer: Optional[Union[str, "Syncer"]] = _DEPRECATED_VALUE
    sync_on_checkpoint: bool = _DEPRECATED_VALUE

    # TODO(justinvyu): [Deprecated] Remove in 2.11.
    def _deprecation_warning(self, attr_name: str, extra_msg: str):
        if getattr(self, attr_name) != _DEPRECATED_VALUE:
            raise DeprecationWarning(
                f"`SyncConfig({attr_name})` is a deprecated configuration "
                "Please remove it from your `SyncConfig`. "
                f"{extra_msg}"
            )

    def __post_init__(self):
        for attr_name, extra_msg in [
            (
                "upload_dir",
                "\nPlease specify `ray.train.RunConfig(storage_path)` instead.",
            ),
            (
                "syncer",
                "\nPlease implement custom syncing logic with a custom "
                "`pyarrow.fs.FileSystem` instead, and pass it into "
                "`ray.train.RunConfig(storage_filesystem)`. "
                "See here: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html#custom-storage",  # noqa: E501
            ),
            ("sync_on_checkpoint", ""),
        ]:
            self._deprecation_warning(attr_name, extra_msg)

    def _repr_html_(self) -> str:
        """Generate an HTML representation of the SyncConfig."""
        return Template("scrollableTable.html.j2").render(
            table=tabulate(
                {
                    "Setting": ["Sync period", "Sync timeout"],
                    "Value": [self.sync_period, self.sync_timeout],
                },
                tablefmt="html",
                showindex=False,
                headers="keys",
            ),
            max_height="none",
        )


class _BackgroundProcess:
    def __init__(self, fn: Callable):
        self._fn = fn
        self._process = None
        self._result = {}
        self._start_time = float("-inf")

    @property
    def is_running(self):
        return self._process and self._process.is_alive()

    @property
    def start_time(self):
        return self._start_time

    def start(self, *args, **kwargs):
        if self.is_running:
            return False

        self._result = {}

        def entrypoint():
            try:
                result = self._fn(*args, **kwargs)
            except Exception as e:
                self._result["exception"] = e
                return

            self._result["result"] = result

        self._process = threading.Thread(target=entrypoint)
        self._process.daemon = True
        self._process.start()
        self._start_time = time.time()

    def wait(self, timeout: Optional[float] = None) -> Any:
        """Waits for the background process to finish running. Waits until the
        background process has run for at least `timeout` seconds, counting from
        the time when the process was started."""
        if not self._process:
            return None

        time_remaining = None
        if timeout:
            elapsed = time.time() - self.start_time
            time_remaining = max(timeout - elapsed, 0)

        self._process.join(timeout=time_remaining)

        if self._process.is_alive():
            self._process = None
            raise TimeoutError(
                f"{getattr(self._fn, '__name__', str(self._fn))} did not finish "
                f"running within the timeout of {timeout} seconds."
            )

        self._process = None

        exception = self._result.get("exception")
        if exception:
            raise exception

        result = self._result.get("result")

        self._result = {}
        return result


@DeveloperAPI
class Syncer(abc.ABC):
    """Syncer class for synchronizing data between Ray nodes and remote (cloud) storage.

    This class handles data transfer for two cases:

    1. Synchronizing data such as experiment state snapshots from the driver to
       cloud storage.
    2. Synchronizing data such as trial checkpoints from remote trainables to
       cloud storage.

    Synchronizing tasks are usually asynchronous and can be awaited using ``wait()``.
    The base class implements a ``wait_or_retry()`` API that will retry a failed
    sync command.

    The base class also exposes an API to only kick off syncs every ``sync_period``
    seconds.

    Args:
        sync_period: The minimum time in seconds between sync operations, as
            used by ``sync_up/down_if_needed``.
        sync_timeout: The maximum time to wait for a sync process to finish before
            issuing a new sync operation. Ex: should be used by ``wait`` if launching
            asynchronous sync tasks.
    """

    def __init__(
        self,
        sync_period: float = DEFAULT_SYNC_PERIOD,
        sync_timeout: float = DEFAULT_SYNC_TIMEOUT,
    ):
        self.sync_period = sync_period
        self.sync_timeout = sync_timeout
        self.last_sync_up_time = float("-inf")
        self.last_sync_down_time = float("-inf")

    @abc.abstractmethod
    def sync_up(
        self, local_dir: str, remote_dir: str, exclude: Optional[List] = None
    ) -> bool:
        """Synchronize local directory to remote directory.

        This function can spawn an asynchronous process that can be awaited in
        ``wait()``.

        Args:
            local_dir: Local directory to sync from.
            remote_dir: Remote directory to sync up to. This is an URI
                (``protocol://remote/path``).
            exclude: Pattern of files to exclude, e.g.
                ``["*/checkpoint_*]`` to exclude trial checkpoints.

        Returns:
            True if sync process has been spawned, False otherwise.

        """
        raise NotImplementedError

    @abc.abstractmethod
    def sync_down(
        self, remote_dir: str, local_dir: str, exclude: Optional[List] = None
    ) -> bool:
        """Synchronize remote directory to local directory.

        This function can spawn an asynchronous process that can be awaited in
        ``wait()``.

        Args:
            remote_dir: Remote directory to sync down from. This is an URI
                (``protocol://remote/path``).
            local_dir: Local directory to sync to.
            exclude: Pattern of files to exclude, e.g.
                ``["*/checkpoint_*]`` to exclude trial checkpoints.

        Returns:
            True if sync process has been spawned, False otherwise.

        """
        raise NotImplementedError

    @abc.abstractmethod
    def delete(self, remote_dir: str) -> bool:
        """Delete directory on remote storage.

        This function can spawn an asynchronous process that can be awaited in
        ``wait()``.

        Args:
            remote_dir: Remote directory to delete. This is an URI
                (``protocol://remote/path``).

        Returns:
            True if sync process has been spawned, False otherwise.

        """
        raise NotImplementedError

    def retry(self):
        """Retry the last sync up, sync down, or delete command.

        You should implement this method if you spawn asynchronous syncing
        processes.
        """
        pass

    def wait(self, timeout: Optional[float] = None):
        """Wait for asynchronous sync command to finish.

        You should implement this method if you spawn asynchronous syncing
        processes. This method should timeout after the asynchronous command
        has run for `sync_timeout` seconds and raise a `TimeoutError`.
        """
        pass

    def sync_up_if_needed(
        self, local_dir: str, remote_dir: str, exclude: Optional[List] = None
    ) -> bool:
        """Syncs up if time since last sync up is greater than sync_period.

        Args:
            local_dir: Local directory to sync from.
            remote_dir: Remote directory to sync up to. This is an URI
                (``protocol://remote/path``).
            exclude: Pattern of files to exclude, e.g.
                ``["*/checkpoint_*]`` to exclude trial checkpoints.
        """
        now = time.time()
        if now - self.last_sync_up_time >= self.sync_period:
            result = self.sync_up(
                local_dir=local_dir, remote_dir=remote_dir, exclude=exclude
            )
            self.last_sync_up_time = now
            return result

    def sync_down_if_needed(
        self, remote_dir: str, local_dir: str, exclude: Optional[List] = None
    ):
        """Syncs down if time since last sync down is greater than sync_period.

        Args:
            remote_dir: Remote directory to sync down from. This is an URI
                (``protocol://remote/path``).
            local_dir: Local directory to sync to.
            exclude: Pattern of files to exclude, e.g.
                ``["*/checkpoint_*]`` to exclude trial checkpoints.
        """
        now = time.time()
        if now - self.last_sync_down_time >= self.sync_period:
            result = self.sync_down(
                remote_dir=remote_dir, local_dir=local_dir, exclude=exclude
            )
            self.last_sync_down_time = now
            return result

    def wait_or_retry(self, max_retries: int = 2, backoff_s: int = 5):
        assert max_retries > 0
        last_error_traceback = None
        for i in range(max_retries + 1):
            try:
                self.wait()
            except Exception as e:
                attempts_remaining = max_retries - i

                # If we're out of retries, then save the full traceback of the last
                # error and show it when raising an exception.
                if attempts_remaining == 0:
                    last_error_traceback = traceback.format_exc()
                    break

                logger.error(
                    f"The latest sync operation failed with the following error: "
                    f"{repr(e)}\n"
                    f"Retrying {attempts_remaining} more time(s) after sleeping "
                    f"for {backoff_s} seconds..."
                )
                time.sleep(backoff_s)
                self.retry()
                continue
            # Succeeded!
            return
        raise RuntimeError(
            f"Failed sync even after {max_retries} retries. "
            f"The latest sync failed with the following error:\n{last_error_traceback}"
        )

    def reset(self):
        self.last_sync_up_time = float("-inf")
        self.last_sync_down_time = float("-inf")

    def close(self):
        pass

    def _repr_html_(self) -> str:
        return


class _BackgroundSyncer(Syncer):
    """Syncer using a background process for asynchronous file transfer."""

    def __init__(
        self,
        sync_period: float = DEFAULT_SYNC_PERIOD,
        sync_timeout: float = DEFAULT_SYNC_TIMEOUT,
    ):
        super(_BackgroundSyncer, self).__init__(
            sync_period=sync_period, sync_timeout=sync_timeout
        )
        self._sync_process = None
        self._current_cmd = None

    def _should_continue_existing_sync(self):
        """Returns whether a previous sync is still running within the timeout."""
        return (
            self._sync_process
            and self._sync_process.is_running
            and time.time() - self._sync_process.start_time < self.sync_timeout
        )

    def _launch_sync_process(self, sync_command: Tuple[Callable, Dict]):
        """Waits for the previous sync process to finish,
        then launches a new process that runs the given command."""
        if self._sync_process:
            try:
                self.wait()
            except Exception:
                logger.warning(
                    f"Last sync command failed with the following error:\n"
                    f"{traceback.format_exc()}"
                )

        self._current_cmd = sync_command
        self.retry()

    def sync_up(
        self, local_dir: str, remote_dir: str, exclude: Optional[List] = None
    ) -> bool:
        if self._should_continue_existing_sync():
            logger.debug(
                f"Last sync still in progress, "
                f"skipping sync up of {local_dir} to {remote_dir}"
            )
            return False

        sync_up_cmd = self._sync_up_command(
            local_path=local_dir, uri=remote_dir, exclude=exclude
        )
        self._launch_sync_process(sync_up_cmd)

        return True

    def _sync_up_command(
        self, local_path: str, uri: str, exclude: Optional[List] = None
    ) -> Tuple[Callable, Dict]:
        raise NotImplementedError

    def sync_down(
        self, remote_dir: str, local_dir: str, exclude: Optional[List] = None
    ) -> bool:
        if self._should_continue_existing_sync():
            logger.warning(
                f"Last sync still in progress, "
                f"skipping sync down of {remote_dir} to {local_dir}"
            )
            return False

        sync_down_cmd = self._sync_down_command(uri=remote_dir, local_path=local_dir)
        self._launch_sync_process(sync_down_cmd)

        return True

    def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]:
        raise NotImplementedError

    def delete(self, remote_dir: str) -> bool:
        if self._should_continue_existing_sync():
            logger.warning(
                f"Last sync still in progress, skipping deletion of {remote_dir}"
            )
            return False

        delete_cmd = self._delete_command(uri=remote_dir)
        self._launch_sync_process(delete_cmd)

        return True

    def _delete_command(self, uri: str) -> Tuple[Callable, Dict]:
        raise NotImplementedError

    def wait(self, timeout: Optional[float] = None):
        if self._sync_process:
            try:
                self._sync_process.wait(timeout=timeout or self.sync_timeout)
            except Exception as e:
                raise e
            finally:
                # Regardless of whether the sync process succeeded within the timeout,
                # clear the sync process so a new one can be created.
                self._sync_process = None

    def retry(self):
        if not self._current_cmd:
            raise RuntimeError("No sync command set, cannot retry.")
        cmd, kwargs = self._current_cmd
        self._sync_process = _BackgroundProcess(cmd)
        self._sync_process.start(**kwargs)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["_sync_process"] = None
        return state

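A minimal sketch (not part of the vendored file above) of the command contract that `_BackgroundSyncer` relies on: `_sync_up_command`, `_sync_down_command`, and `_delete_command` each return a `(callable, kwargs)` tuple, which `retry()` wraps in a `_BackgroundProcess` and runs on a daemon thread. The transfer function and paths below are hypothetical placeholders, shown with plain threading and no Ray dependency:

import threading

def upload_dir(local_path: str, uri: str) -> None:
    # Placeholder for a real transfer (e.g. copying files to cloud storage).
    print(f"uploading {local_path} -> {uri}")

# The (callable, kwargs) tuple a _sync_up_command-style method would return.
sync_command = (upload_dir, {"local_path": "/tmp/trial_dir", "uri": "mock://bucket/trial_dir"})

fn, kwargs = sync_command
process = threading.Thread(target=fn, kwargs=kwargs, daemon=True)
process.start()      # roughly what _BackgroundProcess.start(**kwargs) does
process.join(5.0)    # roughly what wait(timeout=...) does
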
.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py
ADDED
@@ -0,0 +1,239 @@
import abc
import functools
import inspect
import logging
import os
from pathlib import Path
from typing import (
    Any,
    Callable,
    ContextManager,
    Dict,
    List,
    Optional,
    Tuple,
    TypeVar,
    Union,
)

import ray
from ray.actor import ActorHandle
from ray.air._internal.util import (
    StartTraceback,
    StartTracebackWithWorkerRank,
    find_free_port,
)
from ray.exceptions import RayActorError
from ray.types import ObjectRef

T = TypeVar("T")

logger = logging.getLogger(__name__)


def check_for_failure(
    remote_values: List[ObjectRef],
) -> Tuple[bool, Optional[Exception]]:
    """Check for actor failure when retrieving the remote values.

    Args:
        remote_values: List of object references from Ray actor methods.

    Returns:
        A tuple of (bool, Exception). The bool is
        True if evaluating all object references is successful, False otherwise.
    """
    unfinished = remote_values.copy()

    while len(unfinished) > 0:
        finished, unfinished = ray.wait(unfinished)

        # If a failure occurs the ObjectRef will be marked as finished.
        # Calling ray.get will expose the failure as a RayActorError.
        for object_ref in finished:
            # Everything in finished has either failed or completed
            # successfully.
            try:
                ray.get(object_ref)
            except RayActorError as exc:
                failed_actor_rank = remote_values.index(object_ref)
                logger.info(f"Worker {failed_actor_rank} has failed.")
                return False, exc
            except Exception as exc:
                # Other (e.g. training) errors should be directly raised
                failed_worker_rank = remote_values.index(object_ref)
                raise StartTracebackWithWorkerRank(
                    worker_rank=failed_worker_rank
                ) from exc

    return True, None


def get_address_and_port() -> Tuple[str, int]:
    """Returns the IP address and a free port on this node."""
    addr = ray.util.get_node_ip_address()
    port = find_free_port()

    return addr, port


def construct_path(path: Path, parent_path: Path) -> Path:
    """Constructs a path relative to a parent.

    Args:
        path: A relative or absolute path.
        parent_path: A relative path or absolute path.

    Returns: An absolute path.
    """
    if path.expanduser().is_absolute():
        return path.expanduser().resolve()
    else:
        return parent_path.joinpath(path).expanduser().resolve()


def update_env_vars(env_vars: Dict[str, Any]):
    """Updates the environment variables on this worker process.

    Args:
        env_vars: Environment variables to set.
    """
    sanitized = {k: str(v) for k, v in env_vars.items()}
    os.environ.update(sanitized)


def count_required_parameters(fn: Callable) -> int:
    """Counts the number of required parameters of a function.

    NOTE: *args counts as 1 required parameter.

    Examples
    --------

    >>> def fn(a, b, /, c, *args, d=1, e=2, **kwargs):
    ...     pass
    >>> count_required_parameters(fn)
    4

    >>> fn = lambda: 1
    >>> count_required_parameters(fn)
    0

    >>> def fn(config, a, b=1, c=2):
    ...     pass
    >>> from functools import partial
    >>> count_required_parameters(partial(fn, a=0))
    1
    """
    params = inspect.signature(fn).parameters.values()

    positional_param_kinds = {
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.VAR_POSITIONAL,
    }
    return len(
        [
            p
            for p in params
            if p.default == inspect.Parameter.empty and p.kind in positional_param_kinds
        ]
    )


def construct_train_func(
    train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
    config: Optional[Dict[str, Any]],
    train_func_context: ContextManager,
    fn_arg_name: Optional[str] = "train_func",
    discard_returns: bool = False,
) -> Callable[[], T]:
    """Validates and constructs the training function to execute.
    Args:
        train_func: The training function to execute.
            This can either take in no arguments or a ``config`` dict.
        config (Optional[Dict]): Configurations to pass into
            ``train_func``. If None then an empty Dict will be created.
        train_func_context: Context manager for user's `train_func`, which executes
            backend-specific logic before and after the training function.
        fn_arg_name (Optional[str]): The name of training function to use for error
            messages.
        discard_returns: Whether to discard any returns from train_func or not.
    Returns:
        A valid training function.
    Raises:
        ValueError: if the input ``train_func`` is invalid.
    """
    num_required_params = count_required_parameters(train_func)

    if discard_returns:
        # Discard any returns from the function so that
        # BackendExecutor doesn't try to deserialize them.
        # Those returns are inaccesible with AIR anyway.
        @functools.wraps(train_func)
        def discard_return_wrapper(*args, **kwargs):
            try:
                train_func(*args, **kwargs)
            except Exception as e:
                raise StartTraceback from e

        wrapped_train_func = discard_return_wrapper
    else:
        wrapped_train_func = train_func

    if num_required_params > 1:
        err_msg = (
            f"{fn_arg_name} should take in 0 or 1 required arguments, but it accepts "
            f"{num_required_params} required arguments instead."
        )
        raise ValueError(err_msg)
    elif num_required_params == 1:
        config = {} if config is None else config

        @functools.wraps(wrapped_train_func)
        def train_fn():
            try:
                with train_func_context():
                    return wrapped_train_func(config)
            except Exception as e:
                raise StartTraceback from e

    else:  # num_params == 0

        @functools.wraps(wrapped_train_func)
        def train_fn():
            try:
                with train_func_context():
                    return wrapped_train_func()
            except Exception as e:
                raise StartTraceback from e

    return train_fn


class Singleton(abc.ABCMeta):
    """Singleton Abstract Base Class

    https://stackoverflow.com/questions/33364070/implementing
    -singleton-as-metaclass-but-for-abstract-classes
    """

    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]


class ActorWrapper:
    """Wraps an actor to provide same API as using the base class directly."""

    def __init__(self, actor: ActorHandle):
        self.actor = actor

    def __getattr__(self, item):
        # The below will fail if trying to access an attribute (not a method) from the
        # actor.
        actor_method = getattr(self.actor, item)
        return lambda *args, **kwargs: ray.get(actor_method.remote(*args, **kwargs))

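A minimal sketch (not part of the vendored file above) of what `construct_train_func` effectively produces for a one-argument training function; `noop_context` here is a hypothetical stand-in for the backend-specific `train_func_context`:

import contextlib

@contextlib.contextmanager
def noop_context():
    # Stand-in for a backend-specific context manager.
    yield

def train_func(config):
    return config["batch_size"] * 2

# A zero-argument wrapper roughly equivalent to what construct_train_func returns
# for a function with one required parameter and config={"batch_size": 16}.
def train_fn():
    with noop_context():
        return train_func({"batch_size": 16})

assert train_fn() == 32
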
.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py
ADDED
@@ -0,0 +1,426 @@
import logging
import os
import socket
from collections import defaultdict
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union

import ray
from ray.actor import ActorHandle
from ray.air._internal.util import exception_cause, skip_exceptions
from ray.types import ObjectRef
from ray.util.placement_group import PlacementGroup

T = TypeVar("T")

logger = logging.getLogger(__name__)


class RayTrainWorker:
    """A class to execute arbitrary functions. Does not hold any state."""

    def __execute(self, func: Callable[..., T], *args, **kwargs) -> T:
        """Executes the input function and returns the output.

        Args:
            func: The function to execute.
            args, kwargs: The arguments to pass into func.
        """
        try:
            return func(*args, **kwargs)
        except Exception as e:
            skipped = skip_exceptions(e)
            raise skipped from exception_cause(skipped)


@dataclass
class WorkerMetadata:
    """Metadata for each worker/actor.

    This information is expected to stay the same throughout the lifetime of
    actor.

    Args:
        node_id: ID of the node this worker is on.
        node_ip: IP address of the node this worker is on.
        hostname: Hostname that this worker is on.
        resource_ids: Map of accelerator resources
            ("GPU", "neuron_cores", ..) to their IDs.
        pid: Process ID of this worker.
    """

    node_id: str
    node_ip: str
    hostname: str
    resource_ids: Dict[str, List[str]]
    pid: int


@dataclass
class Worker:
    """Class representing a Worker."""

    actor: ActorHandle
    metadata: WorkerMetadata


def create_executable_class(executable_cls: Optional[Type] = None) -> Type:
    """Create the executable class to use as the Ray actors."""
    if not executable_cls:
        return RayTrainWorker
    elif issubclass(executable_cls, RayTrainWorker):
        return executable_cls
    else:

        class _WrappedExecutable(executable_cls, RayTrainWorker):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

        return _WrappedExecutable


def construct_metadata() -> WorkerMetadata:
    """Creates metadata for this worker.

    This function is expected to be run on the actor.
    """
    node_id = ray.get_runtime_context().get_node_id()
    node_ip = ray.util.get_node_ip_address()
    hostname = socket.gethostname()
    accelerator_ids = ray.get_runtime_context().get_accelerator_ids()
    pid = os.getpid()

    return WorkerMetadata(
        node_id=node_id,
        node_ip=node_ip,
        hostname=hostname,
        resource_ids=accelerator_ids,
        pid=pid,
    )


class WorkerGroup:
    """Group of Ray Actors that can execute arbitrary functions.

    ``WorkerGroup`` launches Ray actors according to the given
    specification. It can then execute arbitrary Python functions in each of
    these workers.

    If not enough resources are available to launch the actors, the Ray
    cluster will automatically scale up if autoscaling is enabled.

    Args:
        num_workers: The number of workers (Ray actors) to launch.
            Defaults to 1.
        resources_per_worker (Optional[Dict[str, float]]):
            Dictionary specifying the resources that will be
            requested for each worker. Defaults to {"CPU": 1}.
        actor_cls (Optional[Type]): If specified use this class as the
            remote actors.
        remote_cls_args, remote_cls_kwargs: If ``remote_cls`` is provided,
            these args will be used for the worker initialization.
        placement_group (PlacementGroup|str): The placement group that workers
            should be created in. Defaults to "default" which will inherit the
            parent placement group (if child tasks should be captured).


    Example:

    .. code_block:: python

        worker_group = WorkerGroup(num_workers=2)
        output = worker_group.execute(lambda: 1)
        assert len(output) == 2
        assert all(o == 1 for o in output)
    """

    def __init__(
        self,
        num_workers: int = 1,
        resources_per_worker: Optional[Dict[str, float]] = None,
        actor_cls: Type = None,
        actor_cls_args: Optional[Tuple] = None,
        actor_cls_kwargs: Optional[Dict] = None,
        placement_group: Union[PlacementGroup, str] = "default",
    ):
        if resources_per_worker is None:
            resources_per_worker = {"CPU": 1}
        else:
            resources_per_worker = resources_per_worker.copy()

        if num_workers <= 0:
            raise ValueError(
                "The provided `num_workers` must be greater "
                f"than 0. Received num_workers={num_workers} "
                f"instead."
            )

        if any(v < 0 for v in resources_per_worker.values()):
            raise ValueError(
                "The number of resources per worker must not be negative. "
                f"Received resources_per_worker={resources_per_worker}."
            )

        if (actor_cls_args or actor_cls_kwargs) and not actor_cls:
            raise ValueError(
                "`actor_cls_args` or `actor_class_kwargs` are "
                "passed in but no `actor_cls` is passed in."
            )

        self.num_workers = num_workers
        self.num_cpus_per_worker = resources_per_worker.pop("CPU", 0)
        self.num_gpus_per_worker = resources_per_worker.pop("GPU", 0)
        self.memory_per_worker = resources_per_worker.pop("memory", 0)
        self.workers = []
        self._base_cls = create_executable_class(actor_cls)
        assert issubclass(self._base_cls, RayTrainWorker)

        self._actor_cls_args = actor_cls_args or []
        self._actor_cls_kwargs = actor_cls_kwargs or {}

        self._placement_group = placement_group

        # TODO(matt): Validate resources. Fast-fail if it is impossible to
        # handle the request, rather than hang indefinitely.
        self._remote_cls = ray.remote(
            num_cpus=self.num_cpus_per_worker,
            num_gpus=self.num_gpus_per_worker,
            memory=self.memory_per_worker,
            resources=resources_per_worker,
        )(self._base_cls)
        self.start()

    def start(self):
        """Starts all the workers in this worker group."""
        if self.workers and len(self.workers) > 0:
            raise RuntimeError(
                "The workers have already been started. "
                "Please call `shutdown` first if you want to "
                "restart them."
            )

        logger.debug(f"Starting {self.num_workers} workers.")
        self.add_workers(self.num_workers)
        logger.debug(f"{len(self.workers)} workers have successfully started.")

    def shutdown(self, patience_s: float = 5):
        """Shutdown all the workers in this worker group.

        Args:
            patience_s: Attempt a graceful shutdown
                of the workers for this many seconds. Fallback to force kill
                if graceful shutdown is not complete after this time. If
                this is less than or equal to 0, immediately force kill all
                workers.
        """
        logger.debug(f"Shutting down {len(self.workers)} workers.")
        if patience_s <= 0:
            for worker in self.workers:
                ray.kill(worker.actor)
        else:
            done_refs = [w.actor.__ray_terminate__.remote() for w in self.workers]
            # Wait for actors to die gracefully.
            done, not_done = ray.wait(done_refs, timeout=patience_s)
            if not_done:
                logger.debug("Graceful termination failed. Falling back to force kill.")
                # If all actors are not able to die gracefully, then kill them.
                for worker in self.workers:
                    ray.kill(worker.actor)

        logger.debug("Shutdown successful.")
        self.workers = []

    def execute_async(self, func: Callable[..., T], *args, **kwargs) -> List[ObjectRef]:
        """Execute ``func`` on each worker and return the futures.

        Args:
            func: A function to call on each worker.
            args, kwargs: Passed directly into func.

        Returns:
            (List[ObjectRef]) A list of ``ObjectRef`` representing the
            output of ``func`` from each worker. The order is the same
            as ``self.workers``.

        """
        if len(self.workers) <= 0:
            raise RuntimeError(
                "There are no active workers. This worker "
                "group has most likely been shut down. Please"
                "create a new WorkerGroup or restart this one."
            )

        return [
            w.actor._RayTrainWorker__execute.options(
                name=f"_RayTrainWorker__execute.{func.__name__}"
            ).remote(func, *args, **kwargs)
            for w in self.workers
        ]

    def execute(self, func: Callable[..., T], *args, **kwargs) -> List[T]:
        """Execute ``func`` on each worker and return the outputs of ``func``.

        Args:
            func: A function to call on each worker.
            args, kwargs: Passed directly into func.

        Returns:
            (List[T]) A list containing the output of ``func`` from each
            worker. The order is the same as ``self.workers``.

        """
        return ray.get(self.execute_async(func, *args, **kwargs))

    def execute_single_async(
        self, worker_index: int, func: Callable[..., T], *args, **kwargs
    ) -> ObjectRef:
        """Execute ``func`` on worker ``worker_index`` and return futures.

        Args:
            worker_index: The index to execute func on.
            func: A function to call on the first worker.
            args, kwargs: Passed directly into func.

        Returns:
            (ObjectRef) An ObjectRef representing the output of func.

        """
        if worker_index >= len(self.workers):
            raise ValueError(
                f"The provided worker_index {worker_index} is "
                f"not valid for {self.num_workers} workers."
            )
        return (
            self.workers[worker_index]
            .actor._RayTrainWorker__execute.options(
                name=f"_RayTrainWorker__execute.{func.__name__}"
            )
            .remote(func, *args, **kwargs)
        )

    def execute_single(
        self, worker_index: int, func: Callable[..., T], *args, **kwargs
    ) -> T:
        """Execute ``func`` on worker with index ``worker_index``.

        Args:
            worker_index: The index to execute func on.
            func: A function to call on the first worker.
            args, kwargs: Passed directly into func.

        Returns:
            (T) The output of func.

        """

        return ray.get(self.execute_single_async(worker_index, func, *args, **kwargs))

    def remove_workers(self, worker_indexes: List[int]):
        """Removes the workers with the specified indexes.

        The removed workers will go out of scope and their actor processes
        will be terminated.

        Args:
            worker_indexes (List[int]): The indexes of the workers to remove.
        """
        new_workers = []
        for i in range(len(self.workers)):
            if i not in worker_indexes:
                new_workers.append(self.workers[i])
        self.workers = new_workers

    def add_workers(self, num_workers: int):
        """Adds ``num_workers`` to this WorkerGroup.

        Note: Adding workers when the cluster/placement group is at capacity
        may lead to undefined hanging behavior. If you are attempting to
        replace existing workers in the WorkerGroup, remove_workers() should
        be called first.

        Args:
            num_workers: The number of workers to add.
        """
        new_actors = []
        new_actor_metadata = []
        for _ in range(num_workers):
            actor = self._remote_cls.options(
                placement_group=self._placement_group
            ).remote(*self._actor_cls_args, **self._actor_cls_kwargs)
            new_actors.append(actor)
            new_actor_metadata.append(
                actor._RayTrainWorker__execute.options(
                    name="_RayTrainWorker__execute.construct_metadata"
                ).remote(construct_metadata)
            )

        # Get metadata from all actors.
        metadata = ray.get(new_actor_metadata)

        for i in range(len(new_actors)):
            self.workers.append(Worker(actor=new_actors[i], metadata=metadata[i]))

    def sort_workers_by_node_id_and_gpu_id(self, _first_node_id: Optional[str] = None):
        """Reorder the workers by their node id and the lowest GPU id.

        This is useful for collocating workers on the same node.

        Example:
            Given workers with the following attributes:
                worker_0: node_id=1, gpu_ids=[1]
                worker_1: node_id=0, gpu_ids=[0]
                worker_2: node_id=1, gpu_ids=[0]
                worker_3: node_id=0, gpu_ids=[1]

            The function will perform the following steps:
            1. Group by node ID:
                node_id=0: worker_1, worker_3
                node_id=1: worker_0, worker_2

            2. Sort each group by GPU ID:
                node_id=0: worker_1 (gpu_id=0), worker_3 (gpu_id=1)
                node_id=1: worker_2 (gpu_id=0), worker_0 (gpu_id=1)

            Resulting in the order: [worker_1, worker_3, worker_2, worker_0]

        Args:
            _first_node_id: The first ID to group by.
                Set this to the node ID of the trainer coordinator to ensure that the
                rank 0 worker is on the same node, allowing additional resources to
                be specified for rank 0 workers via
                `ScalingConfig(trainer_resources=)`.
        """
        node_id_to_workers = defaultdict(list)

        if _first_node_id is not None:
            node_id_to_workers[_first_node_id] = []

        for worker in self.workers:
            node_id_to_workers[worker.metadata.node_id].append(worker)

        # Sort workers on the same node by the lowest GPU id
        # More details: https://github.com/ray-project/ray/issues/40803
        def get_lowest_gpu_id(worker) -> int:
            gpu_ids = worker.metadata.resource_ids.get("GPU", [])
            # If there are no GPU IDs, return 0 as a default
            if not gpu_ids:
                return 0

            # Attempt to convert GPU IDs to integers and find the minimum ID.
            # Fallback to return the minimum string-based ID
            try:
                return min(int(gpu_id) for gpu_id in gpu_ids)
            except ValueError:
                return min(gpu_ids)

        for node_id in node_id_to_workers:
            node_id_to_workers[node_id].sort(key=get_lowest_gpu_id)

        sorted_workers = []
        for workers in node_id_to_workers.values():
            sorted_workers.extend(workers)

        self.workers = sorted_workers

    def __len__(self):
        return len(self.workers)

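A minimal sketch (not part of the vendored file above) of the group-then-sort logic described in `sort_workers_by_node_id_and_gpu_id`, reduced to plain dictionaries; the node and GPU IDs below are hypothetical:

from collections import defaultdict

workers = [
    {"name": "worker_0", "node_id": "1", "gpu_ids": ["1"]},
    {"name": "worker_1", "node_id": "0", "gpu_ids": ["0"]},
    {"name": "worker_2", "node_id": "1", "gpu_ids": ["0"]},
    {"name": "worker_3", "node_id": "0", "gpu_ids": ["1"]},
]

# 1. Group by node ID (insertion order of node IDs is preserved).
node_id_to_workers = defaultdict(list)
for w in workers:
    node_id_to_workers[w["node_id"]].append(w)

# 2. Sort each group by the lowest GPU ID.
for group in node_id_to_workers.values():
    group.sort(key=lambda w: min(int(gpu_id) for gpu_id in w["gpu_ids"]))

ordered = [w["name"] for group in node_id_to_workers.values() for w in group]
print(ordered)  # each node's workers appear together, lowest GPU ID first
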
.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py
ADDED
@@ -0,0 +1,22 @@
| 1 |
+
# isort: off
|
| 2 |
+
try:
|
| 3 |
+
import horovod # noqa: F401
|
| 4 |
+
except ModuleNotFoundError:
|
| 5 |
+
raise ModuleNotFoundError(
|
| 6 |
+
"Horovod isn't installed. To install Horovod with PyTorch support, run 'pip "
|
| 7 |
+
"install 'horovod[pytorch]''. To install Horovod with TensorFlow support, "
|
| 8 |
+
"run 'pip install 'horovod[tensorflow]''."
|
| 9 |
+
)
|
| 10 |
+
# isort: on
|
| 11 |
+
|
| 12 |
+
from ray.train.horovod.config import HorovodConfig
|
| 13 |
+
from ray.train.horovod.horovod_trainer import HorovodTrainer
|
| 14 |
+
from ray.train.v2._internal.constants import is_v2_enabled
|
| 15 |
+
|
| 16 |
+
if is_v2_enabled():
|
| 17 |
+
from ray.train.v2.horovod.horovod_trainer import HorovodTrainer # noqa: F811
|
| 18 |
+
|
| 19 |
+
__all__ = ["HorovodConfig", "HorovodTrainer"]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# DO NOT ADD ANYTHING AFTER THIS LINE.
|
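The `__init__` above conditionally re-binds the public `HorovodTrainer` name when `is_v2_enabled()` is true. The sketch below shows that feature-flag re-export pattern in isolation with hypothetical names (`MYLIB_TRAIN_V2`, `TrainerV1`, `TrainerV2`); it is illustrative only and does not reflect how Ray's flag is actually read.

# Sketch of the conditional re-export pattern used above: a feature flag decides
# which implementation a public name points to at import time.
import os


def _v2_enabled() -> bool:
    # Hypothetical flag, analogous in spirit to is_v2_enabled().
    return os.environ.get("MYLIB_TRAIN_V2", "0") == "1"


class TrainerV1:
    def fit(self) -> str:
        return "running v1 implementation"


class TrainerV2:
    def fit(self) -> str:
        return "running v2 implementation"


# Default binding, then override when the flag is on -- mirroring
# `from ray.train.v2... import HorovodTrainer  # noqa: F811` above.
Trainer = TrainerV1
if _v2_enabled():
    Trainer = TrainerV2

__all__ = ["Trainer"]

print(Trainer().fit())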
.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (930 Bytes)

.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc
ADDED
Binary file (9.6 kB)

.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc
ADDED
Binary file (8.93 kB)
.venv/lib/python3.11/site-packages/ray/train/horovod/config.py
ADDED
@@ -0,0 +1,159 @@
+import os
+from dataclasses import dataclass
+from typing import Optional, Set
+
+from horovod.ray.runner import Coordinator
+from horovod.ray.utils import detect_nics, nics_to_env_var
+from horovod.runner.common.util import secret, timeout
+
+import ray
+from ray.train._internal.utils import update_env_vars
+from ray.train._internal.worker_group import Worker, WorkerGroup
+from ray.train.backend import Backend, BackendConfig
+from ray.util import PublicAPI
+
+
+@PublicAPI(stability="beta")
+@dataclass
+class HorovodConfig(BackendConfig):
+    """Configurations for Horovod setup.
+
+    See https://github.com/horovod/horovod/blob/master/horovod/runner/common/util/settings.py # noqa: E501
+
+    Args:
+        nics (Optional[Set[str]]): Network interfaces that can be used for
+            communication.
+        verbose: Horovod logging verbosity.
+        key (Optional[str]): Secret used for communication between workers.
+        ssh_port (Optional[int]): Port for SSH server running on worker nodes.
+        ssh_identity_file (Optional[str]): Path to the identity file to
+            ssh into different hosts on the cluster.
+        ssh_str (Optional[str]): CAUTION WHEN USING THIS. Private key
+            file contents. Writes the private key to ssh_identity_file.
+        timeout_s: Timeout parameter for Gloo rendezvous.
+        placement_group_timeout_s: Timeout parameter for Ray
+            Placement Group creation. Currently unused.
+    """
+
+    nics: Optional[Set[str]] = None
+    verbose: int = 1
+    key: Optional[str] = None
+    ssh_port: Optional[int] = None
+    ssh_identity_file: Optional[str] = None
+    ssh_str: Optional[str] = None
+    timeout_s: int = 300
+    placement_group_timeout_s: int = 100
+
+    @property
+    def start_timeout(self):
+        return timeout.Timeout(
+            self.timeout_s,
+            message="Timed out waiting for {activity}. Please "
+            "check connectivity between servers. You "
+            "may need to increase the --start-timeout "
+            "parameter if you have too many servers.",
+        )
+
+    def __post_init__(self):
+        if self.ssh_str and not os.path.exists(self.ssh_identity_file):
+            with open(self.ssh_identity_file, "w") as f:
+                os.chmod(self.ssh_identity_file, 0o600)
+                f.write(self.ssh_str)
+
+        if self.key is None:
+            self.key = secret.make_secret_key()
+
+    @property
+    def backend_cls(self):
+        return _HorovodBackend
+
+
+class _HorovodBackend(Backend):
+    share_cuda_visible_devices: bool = True
+
+    def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
+        # TODO(matt): Implement placement group strategies in BackendExecutor.
+
+        # Initialize workers with Horovod environment variables
+        setup_futures = []
+        for rank in range(len(worker_group)):
+            worker_node_id = worker_group.workers[rank].metadata.node_id
+            setup_futures.append(
+                worker_group.execute_single_async(
+                    rank,
+                    _init_env_vars,
+                    rank,
+                    len(worker_group),
+                    worker_node_id,
+                )
+            )
+        ray.get(setup_futures)
+
+        # Use Horovod Ray Coordinator
+        # backend_config as settings
+        self.coordinator = Coordinator(backend_config)
+
+        # Get all the hostnames of all workers
+        node_ids = [w.metadata.node_id for w in worker_group.workers]
+        hostnames = [w.metadata.hostname for w in worker_group.workers]
+        # Register each hostname to the coordinator. Assumes the hostname
+        # ordering is the same.
+        for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
+            self.coordinator.register(hostname, node_id, rank)
+        all_info = self.coordinator.finalize_registration()
+
+        setup_futures = []
+        for rank, local_cross_env_var in all_info.items():
+            setup_futures.append(
+                worker_group.execute_single_async(
+                    rank, update_env_vars, local_cross_env_var
+                )
+            )
+        ray.get(setup_futures)
+
+        coordinator_envs = self.coordinator.establish_rendezvous()
+
+        # Get one worker from each host/node.
+        node_worker_indexes = [node_ids.index(node_id) for node_id in set(node_ids)]
+        node_workers = [
+            _HorovodWorkerWrapper(worker_group.workers[worker_index])
+            for worker_index in node_worker_indexes
+        ]
+        assert len(node_workers) == len(self.coordinator.hostnames)
+
+        nics = detect_nics(
+            backend_config,
+            all_host_names=list(self.coordinator.hostnames),
+            node_workers=node_workers,
+        )
+        coordinator_envs.update(nics_to_env_var(nics))
+
+        worker_group.execute(update_env_vars, coordinator_envs)
+
+
+def _init_env_vars(world_rank: int, world_size: int, node_id: str):
+    """Initialize Horovod environment variables."""
+    os.environ["HOROVOD_HOSTNAME"] = node_id
+    os.environ["HOROVOD_RANK"] = str(world_rank)
+    os.environ["HOROVOD_SIZE"] = str(world_size)
+
+
+# TODO(tgaddair): temporary workaround for Horovod's worker discovery logic,
+# which requires passing in an extra parameter as part of the RayExecutor
+# API. This will be removed in the future as we migrate more of the
+# RayExecutor utils into Ray Train.
+# See: https://github.com/horovod/horovod/blob/v0.23.0/horovod/ray/driver_service.py#L9 # noqa: E501
+@dataclass
+class _HorovodWorkerWrapper:
+    w: Worker
+
+    @property
+    def execute(self):
+        w = self.w
+
+        class ExecuteHandle:
+            def remote(self, func, *args, **kwargs):
+                _ = None
+                return w.actor._RayTrainWorker__execute.remote(func, _, *args, **kwargs)
+
+        return ExecuteHandle()
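To make the configuration surface above concrete, here is a hedged usage sketch: it constructs a `HorovodConfig` with a longer rendezvous timeout and higher verbosity and hands it to `HorovodTrainer` (defined in the next file), which forwards it as the backend config. The field names come from the dataclass above; `train_fn` is a placeholder, and the snippet assumes Horovod and a running Ray cluster are available.

# Illustrative usage only (not part of the diff above).
from ray.train import ScalingConfig
from ray.train.horovod import HorovodConfig, HorovodTrainer


def train_fn():
    # Placeholder training loop; see the HorovodTrainer docstring example
    # in the next file for a complete loop.
    pass


horovod_config = HorovodConfig(
    timeout_s=600,          # allow more time for the Gloo rendezvous
    verbose=2,              # raise Horovod logging verbosity
    # nics={"eth0"},        # optionally pin the network interface(s) to use
)

trainer = HorovodTrainer(
    train_loop_per_worker=train_fn,
    horovod_config=horovod_config,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
# result = trainer.fit()  # launches the workers and runs the Horovod rendezvous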
.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py
ADDED
@@ -0,0 +1,202 @@
+from typing import Any, Callable, Dict, Optional, Union
+
+from ray.air.config import RunConfig, ScalingConfig
+from ray.train import Checkpoint, DataConfig
+from ray.train.data_parallel_trainer import DataParallelTrainer
+from ray.train.horovod.config import HorovodConfig
+from ray.train.trainer import GenDataset
+from ray.util.annotations import PublicAPI
+
+
+@PublicAPI(stability="beta")
+class HorovodTrainer(DataParallelTrainer):
+    """A Trainer for data parallel Horovod training.
+
+    This Trainer runs the function ``train_loop_per_worker`` on multiple Ray
+    Actors. These actors already have the necessary Horovod setup configured
+    for distributed Horovod training.
+
+    The ``train_loop_per_worker`` function is expected to take in either 0 or 1
+    arguments:
+
+    .. testcode::
+
+        def train_loop_per_worker():
+            ...
+
+    .. testcode::
+
+        def train_loop_per_worker(config: Dict):
+            ...
+
+    If ``train_loop_per_worker`` accepts an argument, then
+    ``train_loop_config`` will be passed in as the argument. This is useful if you
+    want to tune the values in ``train_loop_config`` as hyperparameters.
+
+    If the ``datasets`` dict contains a training dataset (denoted by
+    the "train" key), then it will be split into multiple dataset
+    shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside
+    ``train_loop_per_worker``. All the other datasets will not be split and
+    ``ray.train.get_dataset_shard(...)`` will return the entire Dataset.
+
+    Inside the ``train_loop_per_worker`` function, you can use any of the
+    :ref:`Ray Train loop methods <train-loop-api>`.
+
+    .. testcode::
+
+        from ray import train
+
+        def train_loop_per_worker():
+            # Report intermediate results for callbacks or logging and
+            # checkpoint data.
+            train.report(...)
+
+            # Returns dict of last saved checkpoint.
+            train.get_checkpoint()
+
+            # Returns the Dataset shard for the given key.
+            train.get_dataset_shard("my_dataset")
+
+            # Returns the total number of workers executing training.
+            train.get_context().get_world_size()
+
+            # Returns the rank of this worker.
+            train.get_context().get_world_rank()
+
+            # Returns the rank of the worker on the current node.
+            train.get_context().get_local_rank()
+
+    Any returns from the ``train_loop_per_worker`` will be discarded and not
+    used or persisted anywhere.
+
+    You could use ``TensorflowPredictor`` or ``TorchPredictor`` in conjunction with
+    HorovodTrainer. You must save the model under the "model" kwarg in the
+    ``Checkpoint`` passed to ``train.report()``, so that it can be used by
+    corresponding predictors.
+
+    Example:
+
+
+        .. testcode::
+            :skipif: True
+
+            import os
+            import tempfile
+
+            import ray
+            import horovod.torch as hvd
+            import torch
+            import torch.nn as nn
+
+            from ray import train
+            import ray.train.torch  # Need this to use `train.torch.get_device()`
+            from ray.train import Checkpoint, ScalingConfig
+            from ray.train.horovod import HorovodTrainer
+
+            # If using GPUs, set this to True.
+            use_gpu = False
+
+            input_size = 1
+            layer_size = 15
+            output_size = 1
+            num_epochs = 3
+
+            class NeuralNetwork(nn.Module):
+                def __init__(self):
+                    super(NeuralNetwork, self).__init__()
+                    self.layer1 = nn.Linear(input_size, layer_size)
+                    self.relu = nn.ReLU()
+                    self.layer2 = nn.Linear(layer_size, output_size)
+                def forward(self, input):
+                    return self.layer2(self.relu(self.layer1(input)))
+
+            def train_loop_per_worker():
+                hvd.init()
+                dataset_shard = train.get_dataset_shard("train")
+                model = NeuralNetwork()
+                device = train.torch.get_device()
+                model.to(device)
+                loss_fn = nn.MSELoss()
+                lr_scaler = 1
+                optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * lr_scaler)
+                # Horovod: wrap optimizer with DistributedOptimizer.
+                optimizer = hvd.DistributedOptimizer(
+                    optimizer,
+                    named_parameters=model.named_parameters(),
+                    op=hvd.Average,
+                )
+                for epoch in range(num_epochs):
+                    model.train()
+                    for batch in dataset_shard.iter_torch_batches(
+                        batch_size=32, dtypes=torch.float
+                    ):
+                        inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"]
+                        outputs = model(inputs)
+                        loss = loss_fn(outputs, labels)
+                        optimizer.zero_grad()
+                        loss.backward()
+                        optimizer.step()
+                        print(f"epoch: {epoch}, loss: {loss.item()}")
+
+                    # Save a model checkpoint at the end of each epoch
+                    with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
+                        ckpt_path = os.path.join(temp_checkpoint_dir, "model.pt")
+                        torch.save(model.state_dict(), ckpt_path)
+                        train.report(
+                            {"loss": loss.item(), "epoch": epoch},
+                            checkpoint=Checkpoint.from_directory(temp_checkpoint_dir),
+                        )
+
+            train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
+            scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
+            trainer = HorovodTrainer(
+                train_loop_per_worker=train_loop_per_worker,
+                scaling_config=scaling_config,
+                datasets={"train": train_dataset},
+            )
+            result = trainer.fit()
+
+    Args:
+        train_loop_per_worker: The training function to execute.
+            This can either take in no arguments or a ``config`` dict.
+        train_loop_config: Configurations to pass into
+            ``train_loop_per_worker`` if it accepts an argument.
+        horovod_config: Configuration for setting up the Horovod backend.
+            If set to None, use the default configuration. This replaces the
+            ``backend_config`` arg of ``DataParallelTrainer``.
+        scaling_config: Configuration for how to scale data parallel training.
+        dataset_config: Configuration for dataset ingest.
+        run_config: Configuration for the execution of the training run.
+        datasets: Any Datasets to use for training. Use
+            the key "train" to denote which dataset is the training
+            dataset.
+        resume_from_checkpoint: A checkpoint to resume training from.
+        metadata: Dict that should be made available via
+            `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
+            for checkpoints saved from this Trainer. Must be JSON-serializable.
+    """
+
+    def __init__(
+        self,
+        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
+        *,
+        train_loop_config: Optional[Dict] = None,
+        horovod_config: Optional[HorovodConfig] = None,
+        scaling_config: Optional[ScalingConfig] = None,
+        dataset_config: Optional[DataConfig] = None,
+        run_config: Optional[RunConfig] = None,
+        datasets: Optional[Dict[str, GenDataset]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        resume_from_checkpoint: Optional[Checkpoint] = None,
+    ):
+        super().__init__(
+            train_loop_per_worker=train_loop_per_worker,
+            train_loop_config=train_loop_config,
+            backend_config=horovod_config or HorovodConfig(),
+            scaling_config=scaling_config,
+            dataset_config=dataset_config,
+            run_config=run_config,
+            datasets=datasets,
+            resume_from_checkpoint=resume_from_checkpoint,
+            metadata=metadata,
+        )
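The docstring above notes that a one-argument `train_loop_per_worker` receives `train_loop_config`. Below is a minimal sketch of that flow, assuming only the public API shown in the diff; the loop body and the config values are placeholders.

# Illustrative sketch (not part of the diff above): train_loop_config is passed
# to a one-argument train_loop_per_worker, which is how hyperparameters reach
# each worker.
from ray.train import ScalingConfig
from ray.train.horovod import HorovodTrainer


def train_loop_per_worker(config: dict):
    # `config` is exactly the dict given as train_loop_config below.
    lr = config["lr"]
    num_epochs = config["num_epochs"]
    for epoch in range(num_epochs):
        # ... training step using lr ...
        pass


trainer = HorovodTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"lr": 0.1, "num_epochs": 3},
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
# result = trainer.fit()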
.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py
ADDED
@@ -0,0 +1,39 @@
+# isort: off
+try:
+    import lightning  # noqa: F401
+except ModuleNotFoundError:
+    try:
+        import pytorch_lightning  # noqa: F401
+    except ModuleNotFoundError:
+        raise ModuleNotFoundError(
+            "PyTorch Lightning isn't installed. To install PyTorch Lightning, "
+            "please run 'pip install lightning'"
+        )
+# isort: on
+
+from ray.train.lightning._lightning_utils import (
+    RayDDPStrategy,
+    RayDeepSpeedStrategy,
+    RayFSDPStrategy,
+    RayLightningEnvironment,
+    RayTrainReportCallback,
+    prepare_trainer,
+)
+from ray.train.v2._internal.constants import is_v2_enabled
+
+if is_v2_enabled():
+    from ray.train.v2.lightning.lightning_utils import (  # noqa: F811
+        RayTrainReportCallback,
+    )
+
+__all__ = [
+    "prepare_trainer",
+    "RayDDPStrategy",
+    "RayFSDPStrategy",
+    "RayDeepSpeedStrategy",
+    "RayLightningEnvironment",
+    "RayTrainReportCallback",
+]
+
+
+# DO NOT ADD ANYTHING AFTER THIS LINE.
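For context on how the exports above are typically combined, here is a sketch of a per-worker training function following the usual Ray Train + Lightning pattern: `RayDDPStrategy` as the strategy, `RayLightningEnvironment` as a plugin, `RayTrainReportCallback` for reporting, and `prepare_trainer` applied before `fit()`. `TinyModule` and the synthetic dataset are placeholders, and the function is meant to run inside a Ray Train worker (for example launched via `TorchTrainer`), not as a standalone script.

# Illustrative pattern sketch (not part of the diff above).
import lightning.pytorch as pl
import torch
from torch.utils.data import DataLoader, TensorDataset

from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)


class TinyModule(pl.LightningModule):
    # Minimal placeholder LightningModule for the sketch.
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(1, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


def train_loop_per_worker():
    x = torch.arange(32, dtype=torch.float32).reshape(-1, 1)
    loader = DataLoader(TensorDataset(x, x + 1.0), batch_size=8)

    trainer = pl.Trainer(
        max_epochs=1,
        strategy=RayDDPStrategy(),            # Ray-aware DDP strategy
        plugins=[RayLightningEnvironment()],  # cluster environment from Ray Train
        callbacks=[RayTrainReportCallback()], # reports metrics/checkpoints to Ray
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)        # final validation/patching by Ray
    trainer.fit(TinyModule(), train_dataloaders=loader)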
.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.16 kB)