koichi12 committed
Commit ec4d14c · verified · 1 Parent(s): d5967d1

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .venv/lib/python3.11/site-packages/ray/train/v2/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/__init__.py +0 -0
  3. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/__init__.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/exceptions.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/util.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__init__.py +14 -0
  7. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/accelerators.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/backend_setup.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/datasets.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/metrics.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/user_callback.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/working_dir_setup.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/accelerators.py +151 -0
  14. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/backend_setup.py +27 -0
  15. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/datasets.py +76 -0
  16. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/metrics.py +250 -0
  17. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/user_callback.py +50 -0
  18. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/working_dir_setup.py +24 -0
  19. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/constants.py +84 -0
  20. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/exceptions.py +170 -0
  21. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__init__.py +0 -0
  22. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/callback.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/context.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/controller.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/storage.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/callback.py +140 -0
  27. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__init__.py +0 -0
  28. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/__init__.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/checkpoint_manager.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/report_handler.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/sync_actor.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py +271 -0
  33. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/report_handler.py +111 -0
  34. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/sync_actor.py +190 -0
  35. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/context.py +281 -0
  36. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/controller.py +377 -0
  37. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/default.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/factory.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/failure_policy.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/default.py +44 -0
  42. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/factory.py +13 -0
  43. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__init__.py +19 -0
  44. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/factory.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/fixed.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/scaling_policy.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/factory.py +13 -0
  48. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/fixed.py +22 -0
  49. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/scaling_policy.py +51 -0
  50. .venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/storage.py +551 -0
.venv/lib/python3.11/site-packages/ray/train/v2/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (185 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (195 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/exceptions.cpython-311.pyc ADDED
Binary file (10.1 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/__pycache__/util.cpython-311.pyc ADDED
Binary file (8.22 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from .accelerators import AcceleratorSetupCallback
+ from .backend_setup import BackendSetupCallback
+ from .datasets import DatasetsSetupCallback
+ from .working_dir_setup import WorkingDirectorySetupCallback
+
+ __all__ = [
+     "AcceleratorSetupCallback",
+     "BackendSetupCallback",
+     "DatasetsSetupCallback",
+     "WorkingDirectorySetupCallback",
+ ]
+
+
+ # DO NOT ADD ANYTHING AFTER THIS LINE.
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/accelerators.cpython-311.pyc ADDED
Binary file (7.69 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/backend_setup.cpython-311.pyc ADDED
Binary file (2.26 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/datasets.cpython-311.pyc ADDED
Binary file (4.62 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/metrics.cpython-311.pyc ADDED
Binary file (14.1 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/user_callback.cpython-311.pyc ADDED
Binary file (2.52 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/__pycache__/working_dir_setup.cpython-311.pyc ADDED
Binary file (1.93 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/accelerators.py ADDED
@@ -0,0 +1,151 @@
+ import logging
+ import os
+ from collections import defaultdict
+ from typing import List
+
+ import ray._private.ray_constants as ray_constants
+ from ray._private.ray_constants import env_bool
+ from ray.train import BackendConfig
+ from ray.train.constants import ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV
+ from ray.train.v2._internal.execution.callback import WorkerGroupCallback
+ from ray.train.v2._internal.execution.worker_group import ActorMetadata, WorkerGroup
+ from ray.train.v2._internal.util import ray_get_safe
+ from ray.train.v2.api.config import ScalingConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class AcceleratorSetupCallback(WorkerGroupCallback):
+     """Perform accelerator setup for workers.
+
+     For example, this callback can be used to share CUDA_VISIBLE_DEVICES
+     among workers on the same node.
+     """
+
+     def __init__(self, backend_config: BackendConfig, scaling_config: ScalingConfig):
+         self._backend = backend_config.backend_cls()
+         self._scaling_config = scaling_config
+
+     def after_worker_group_start(self, worker_group: WorkerGroup):
+         self._maybe_share_cuda_visible_devices(worker_group)
+         # TODO: Add support for sharing other accelerator resources.
+
+     def _maybe_share_cuda_visible_devices(self, worker_group: WorkerGroup):
+         share_cuda_visible_devices_enabled = env_bool(
+             ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
+             self._backend.share_cuda_visible_devices,
+         )
+
+         if (
+             self._scaling_config._resources_per_worker_not_none.get("GPU", 0) > 0
+             and share_cuda_visible_devices_enabled
+         ):
+             _share_cuda_visible_devices(worker_group)
+
+
+ def _share_cuda_visible_devices(worker_group: WorkerGroup):
+     """Sets CUDA_VISIBLE_DEVICES on all workers.
+     For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs
+     visible to all workers on that worker's node.
+     This allows GPU workers on the same node to communicate with one
+     another.
+
+     Example:
+         Setup:
+         - Node1:
+             - Worker1: {0, 1}
+             - Worker2: {2, 3}
+         - Node2:
+             - Worker3: {0, 1}
+         CUDA_VISIBLE_DEVICES:
+         - Worker1: "0,1,2,3"
+         - Worker2: "0,1,2,3"
+         - Worker3: "0,1"
+     """
+     _share_accelerator_ids(
+         worker_group, ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR
+     )
+
+
+ def _share_accelerator_ids(
+     worker_group: WorkerGroup, accelerator_name: str, env_var: str
+ ):
+     """Sets the given env_var on all workers.
+     For each worker, the cores/devices are visible to all the
+     workers on that worker's node. This allows workers on the
+     same node to communicate with one another.
+
+     Example:
+         Setup:
+         - Node1:
+             - Worker1: {0, 1}
+             - Worker2: {2, 3}
+         - Node2:
+             - Worker3: {0, 1}
+         NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...:
+         - Worker1: "0,1,2,3"
+         - Worker2: "0,1,2,3"
+         - Worker3: "0,1"
+
+     Args:
+         accelerator_name: The name of the accelerator.
+         env_var: The name of the environment variable to set.
+     """
+     if not worker_group.has_started():
+         raise RuntimeError(
+             "WorkerGroup must be started before sharing accelerator IDs."
+         )
+
+     worker_metadatas = [worker.metadata for worker in worker_group.get_workers()]
+     visible_accelerator_ids_per_worker = _get_visible_accelerator_ids_per_worker(
+         worker_metadatas=worker_metadatas, accelerator_name=accelerator_name
+     )
+
+     def set_accelerator_ids(accelerator_ids):
+         os.environ[env_var] = accelerator_ids
+
+     futures = []
+     for rank, visible_accelerator_ids in enumerate(visible_accelerator_ids_per_worker):
+         futures.append(
+             worker_group.execute_single_async(
+                 rank, set_accelerator_ids, accelerator_ids=visible_accelerator_ids
+             )
+         )
+     ray_get_safe(futures)
+
+
+ def _get_visible_accelerator_ids_per_worker(
+     worker_metadatas: List[ActorMetadata], accelerator_name: str
+ ) -> List[str]:
+     """Returns a list of comma-separated accelerator IDs visible to each worker.
+
+     All workers on a node should have the same set of visible accelerators,
+     which is the union of the accelerator IDs of the workers on that node.
+
+     Returns:
+         visible_accelerator_ids_per_worker: A list of comma-separated accelerator ID
+             strings. This list is the same length as the number of workers.
+     """
+     for metadata in worker_metadatas:
+         if accelerator_name not in metadata.accelerator_ids:
+             raise ValueError(
+                 f"Accelerator '{accelerator_name}' is not available on all workers. "
+                 f"Got these available accelerators instead: {metadata.accelerator_ids}"
+             )
+
+     node_id_to_accelerator_ids = defaultdict(set)
+
+     for metadata in worker_metadatas:
+         node_id_to_accelerator_ids[metadata.node_id].update(
+             metadata.accelerator_ids[accelerator_name]
+         )
+
+     visible_accelerator_ids_per_worker = []
+     for worker_id in range(len(worker_metadatas)):
+         node_id = worker_metadatas[worker_id].node_id
+         accelerator_ids = sorted(node_id_to_accelerator_ids[node_id])
+         all_resource_ids = ",".join([str(id) for id in accelerator_ids])
+         visible_accelerator_ids_per_worker.append(all_resource_ids)
+
+     return visible_accelerator_ids_per_worker
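
The docstring example above maps per-worker GPU assignments to a per-node union. The following is a minimal standalone sketch of that grouping logic (not part of the uploaded file; plain dictionaries stand in for `ActorMetadata`):

from collections import defaultdict

# Simplified stand-ins for ActorMetadata: node placement plus GPU IDs per worker.
worker_metadatas = [
    {"node_id": "node1", "gpu_ids": [0, 1]},  # Worker1
    {"node_id": "node1", "gpu_ids": [2, 3]},  # Worker2
    {"node_id": "node2", "gpu_ids": [0, 1]},  # Worker3
]

# Union the GPU IDs of all workers that share a node.
node_id_to_gpu_ids = defaultdict(set)
for meta in worker_metadatas:
    node_id_to_gpu_ids[meta["node_id"]].update(meta["gpu_ids"])

# Each worker then sees the full set of GPU IDs on its own node.
visible_per_worker = [
    ",".join(str(i) for i in sorted(node_id_to_gpu_ids[meta["node_id"]]))
    for meta in worker_metadatas
]
print(visible_per_worker)  # ['0,1,2,3', '0,1,2,3', '0,1']
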
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/backend_setup.py ADDED
@@ -0,0 +1,27 @@
+ import logging
+
+ from ray.exceptions import RayActorError
+ from ray.train.backend import BackendConfig
+ from ray.train.v2._internal.execution.callback import WorkerGroupCallback
+ from ray.train.v2._internal.execution.worker_group import WorkerGroup
+
+ logger = logging.getLogger(__name__)
+
+
+ class BackendSetupCallback(WorkerGroupCallback):
+     def __init__(self, backend_config: BackendConfig):
+         self._backend_config = backend_config
+         self._backend = backend_config.backend_cls()
+
+     def after_worker_group_start(self, worker_group: WorkerGroup):
+         self._backend.on_start(worker_group, self._backend_config)
+         self._backend.on_training_start(worker_group, self._backend_config)
+
+     def before_worker_group_shutdown(self, worker_group: WorkerGroup):
+         try:
+             self._backend.on_shutdown(worker_group, self._backend_config)
+         except RayActorError:
+             logger.warning(
+                 "Graceful shutdown of backend failed. This is "
+                 "expected if one of the workers has crashed."
+             )
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/datasets.py ADDED
@@ -0,0 +1,76 @@
+ import copy
+ from typing import Any, Callable, Dict, List, Union
+
+ import ray.train
+ from ray.data import Dataset
+ from ray.data.context import DataContext
+ from ray.train.v2._internal.execution.callback import WorkerGroupCallback
+ from ray.train.v2._internal.execution.worker_group.worker_group import WorkerGroup
+
+ # A type representing either a ray.data.Dataset or a function that returns a
+ # ray.data.Dataset and accepts no arguments.
+ GenDataset = Union[Dataset, Callable[[], Dataset]]
+
+
+ class DatasetsSetupCallback(WorkerGroupCallback):
+     """The callback to set up Ray Datasets for the worker group."""
+
+     def __init__(
+         self,
+         datasets: Dict[str, GenDataset],
+         data_config: ray.train.DataConfig,
+         scaling_config: ray.train.ScalingConfig,
+     ):
+         self._datasets = datasets
+         self._data_config = data_config
+         self._scaling_config = scaling_config
+
+         # Capture the current DataContext to propagate it to
+         # the Train workers later.
+         # The propagation works in the following way:
+         # 1. This callback is created when the user creates the Trainer.
+         # 2. This callback is then passed to the Controller actor.
+         # 3. Lastly, when the worker group is initialized, the Controller
+         #    calls the `after_worker_group_start` hook to propagate
+         #    the DataContext to the Train workers.
+         self._data_context = copy.deepcopy(DataContext.get_current())
+
+     def get_train_total_resources(
+         self, scaling_config: ray.train.ScalingConfig
+     ) -> Dict[str, float]:
+         """Return the resources reserved for training, so that Data can exclude
+         these resources logically from its available pool."""
+         return scaling_config.total_resources
+
+     def before_init_train_context(
+         self, worker_group: "WorkerGroup"
+     ) -> Dict[str, List[Any]]:
+         # Configure dataset shards
+         datasets = {k: v() if callable(v) else v for k, v in self._datasets.items()}
+         node_ids = [worker.metadata.node_id for worker in worker_group.get_workers()]
+
+         # Notify the DataConfig about the total resources reserved for training.
+         total_train_resources = self.get_train_total_resources(self._scaling_config)
+         self._data_config.set_train_total_resources(
+             total_train_resources.get("CPU", 0), total_train_resources.get("GPU", 0)
+         )
+
+         dataset_shards = self._data_config.configure(
+             datasets,
+             world_size=len(worker_group),
+             worker_handles=None,
+             worker_node_ids=node_ids,
+         )
+         assert len(dataset_shards) == len(worker_group)
+
+         return {"dataset_shards": dataset_shards}
+
+     def after_worker_group_start(self, worker_group: "WorkerGroup"):
+         # Propagate DataContext
+         def _propagate_data_context(ctx: DataContext):
+             DataContext._set_current(ctx)
+
+         worker_group.execute(
+             _propagate_data_context,
+             self._data_context,
+         )
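
The `GenDataset` alias above accepts either a concrete `ray.data.Dataset` or a zero-argument factory. A small sketch of the normalization performed in `before_init_train_context` (not part of the uploaded file; the example datasets are hypothetical and assume Ray Data is installed):

import ray

# Values may be a concrete Dataset or a zero-argument factory (GenDataset).
datasets = {
    "train": ray.data.range(100),       # already materialized
    "val": lambda: ray.data.range(10),  # built lazily when configuring shards
}

# The same normalization used above before calling DataConfig.configure().
materialized = {k: v() if callable(v) else v for k, v in datasets.items()}
assert all(isinstance(d, ray.data.Dataset) for d in materialized.values())
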
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/metrics.py ADDED
@@ -0,0 +1,250 @@
+ import threading
+ import time
+ from contextlib import contextmanager
+ from dataclasses import asdict, dataclass, field, fields
+ from typing import Dict, Optional
+
+ from ray.train.v2._internal.execution.callback import (
+     ControllerCallback,
+     TrainContextCallback,
+     WorkerCallback,
+     WorkerGroupCallback,
+ )
+ from ray.train.v2._internal.execution.context import TrainRunContext, get_train_context
+ from ray.train.v2._internal.util import time_monotonic
+ from ray.util.metrics import Gauge
+
+ # Prometheus tag keys for the worker and controller metrics.
+ RUN_NAME_TAG_KEY = "ray_train_run_name"
+ WORKER_WORLD_RANK_TAG_KEY = "ray_train_worker_world_rank"
+
+
+ @dataclass
+ class ControllerMetrics:
+     """A list of Train controller metrics.
+
+     Metric metadata attributes:
+     - description (required): A human-readable description of the metric, also used as
+       the chart description on the Ray Train dashboard.
+     """
+
+     train_worker_group_start_total_time_s: float = field(
+         default=0.0,
+         metadata={
+             "description": (
+                 "Cumulative time in seconds to start worker groups in the Train job."
+             ),
+         },
+     )
+
+     train_worker_group_shutdown_total_time_s: float = field(
+         default=0.0,
+         metadata={
+             "description": (
+                 "Cumulative time in seconds to shutdown worker groups in the Train job."
+             ),
+         },
+     )
+
+
+ @dataclass
+ class WorkerMetrics:
+     """A list of Train worker metrics.
+
+     Metric metadata attributes:
+     - description (required): A human-readable description of the metric, also used as
+       the chart description on the Ray Train dashboard.
+     """
+
+     train_report_total_blocked_time_s: float = field(
+         default=0.0,
+         metadata={
+             "description": (
+                 "Cumulative time in seconds to report a checkpoint to the storage."
+             ),
+         },
+     )
+
+
+ class ControllerMetricsCallback(ControllerCallback, WorkerGroupCallback):
+     # Interval for pushing metrics to Prometheus.
+     LOCAL_METRICS_PUSH_INTERVAL_S: float = 5.0
+     CONTROLLER_TAG_KEYS = (RUN_NAME_TAG_KEY,)
+
+     def __init__(self, train_run_context: TrainRunContext):
+         """
+         This callback is initialized on the driver process and then passed to the
+         controller. This callback collects metrics from the controller actor as well
+         as the metrics related to the worker groups.
+         """
+         self._run_name = train_run_context.get_run_config().name
+         self._thread: Optional[threading.Thread] = None
+         self._thread_stop_event: Optional[threading.Event] = None
+         self._metrics: Optional[ControllerMetrics] = None
+         self._metrics_lock: Optional[threading.Lock] = None
+         self._controller_tag: Dict[str, str] = {}
+         self._metrics_gauges: Dict[str, Gauge] = {}
+
+     def _create_prometheus_controller_metrics(self) -> Dict[str, Gauge]:
+         """Create Prometheus controller metrics for the ControllerMetrics dataclass."""
+         metrics = {}
+         for _field in fields(ControllerMetrics):
+             metric_description = _field.metadata.get("description")
+             metrics[_field.name] = Gauge(
+                 _field.name,
+                 description=metric_description,
+                 tag_keys=self.CONTROLLER_TAG_KEYS,
+             )
+         return metrics
+
+     def after_controller_start(self):
+         """
+         Create a thread to periodically push local metrics to the gauges
+         after the train controller starts.
+         """
+         self._controller_tag = {
+             RUN_NAME_TAG_KEY: self._run_name,
+         }
+         self._thread_stop_event = threading.Event()
+         self._metrics_lock = threading.Lock()
+         self._metrics = ControllerMetrics()
+         self._metrics_gauges = self._create_prometheus_controller_metrics()
+
+         def push_local_metrics():
+             while not self._thread_stop_event.is_set():
+                 with self._metrics_lock:
+                     metrics_dict = asdict(self._metrics)
+                     for metric_name, metric_value in metrics_dict.items():
+                         self._metrics_gauges[metric_name].set(
+                             metric_value, self._controller_tag
+                         )
+                 time.sleep(ControllerMetricsCallback.LOCAL_METRICS_PUSH_INTERVAL_S)
+
+         assert not self._thread
+         self._thread = threading.Thread(target=push_local_metrics, daemon=True)
+         self._thread.start()
+
+     def before_controller_shutdown(self):
+         """
+         Stop the thread that pushes local metrics to the gauges before the
+         controller shuts down.
+         """
+         # Stop the thread that pushes local metrics to the metrics gauges.
+         assert not self._thread_stop_event.is_set()
+         self._thread_stop_event.set()
+         # Reset the metrics to their default values.
+         for _field in fields(self._metrics):
+             self._metrics_gauges[_field.name].set(_field.default, self._controller_tag)
+
+     @contextmanager
+     def on_worker_group_start(self):
+         """
+         Context manager to measure the time taken to start a worker group.
+         """
+         start_time_s = time_monotonic()
+         yield
+         elapsed_time_s = time_monotonic() - start_time_s
+         with self._metrics_lock:
+             self._metrics.train_worker_group_start_total_time_s += elapsed_time_s
+
+     @contextmanager
+     def on_worker_group_shutdown(self):
+         """
+         Context manager to measure the time taken to shut down a worker group.
+         """
+         start_time_s = time_monotonic()
+         yield
+         elapsed_time_s = time_monotonic() - start_time_s
+         with self._metrics_lock:
+             self._metrics.train_worker_group_shutdown_total_time_s += elapsed_time_s
+
+
+ class WorkerMetricsCallback(WorkerCallback, TrainContextCallback):
+     # Interval for pushing metrics to Prometheus.
+     LOCAL_METRICS_PUSH_INTERVAL_S: float = 5.0
+     WORKER_TAG_KEYS = (RUN_NAME_TAG_KEY, WORKER_WORLD_RANK_TAG_KEY)
+
+     def __init__(self, train_run_context: TrainRunContext):
+         """
+         This callback is initialized on the driver process and then passed to the
+         workers. When adding more class attributes, make sure the attributes are
+         picklable.
+
+         TODO: Make callbacks factory methods so that, when they are initialized on
+         the driver process, we do not need to worry about pickling the callback
+         instances.
+         """
+         self._run_name = train_run_context.get_run_config().name
+         self._thread: Optional[threading.Thread] = None
+         self._thread_stop_event: Optional[threading.Event] = None
+         self._metrics_lock: Optional[threading.Lock] = None
+         self._metrics: Optional[WorkerMetrics] = None
+         self._worker_tag: Dict[str, str] = {}
+         self._metrics_gauges: Dict[str, Gauge] = {}
+
+     def _create_prometheus_worker_metrics(self) -> Dict[str, Gauge]:
+         """Create Prometheus worker metrics for the WorkerMetrics dataclass."""
+         metrics = {}
+         for _field in fields(self._metrics):
+             metric_description = _field.metadata.get("description")
+             metrics[_field.name] = Gauge(
+                 _field.name,
+                 description=metric_description,
+                 tag_keys=self.WORKER_TAG_KEYS,
+             )
+         return metrics
+
+     def after_init_train_context(self):
+         """
+         Create a thread to periodically push local metrics to the gauges
+         after the train context is initialized.
+
+         Note:
+             This method should be called after the train context is initialized on
+             each of the workers. The thread should not be created in the `__init__`
+             method, which is called on the train driver process.
+         """
+         self._worker_tag = {
+             RUN_NAME_TAG_KEY: self._run_name,
+             WORKER_WORLD_RANK_TAG_KEY: str(get_train_context().get_world_rank()),
+         }
+         self._thread_stop_event = threading.Event()
+         self._metrics_lock = threading.Lock()
+         self._metrics = WorkerMetrics()
+         self._metrics_gauges = self._create_prometheus_worker_metrics()
+
+         def push_local_metrics():
+             while not self._thread_stop_event.is_set():
+                 with self._metrics_lock:
+                     metrics_dict = asdict(self._metrics)
+                     for metric_name, metric_value in metrics_dict.items():
+                         self._metrics_gauges[metric_name].set(
+                             metric_value, self._worker_tag
+                         )
+                 time.sleep(WorkerMetricsCallback.LOCAL_METRICS_PUSH_INTERVAL_S)
+
+         assert not self._thread
+         self._thread = threading.Thread(target=push_local_metrics, daemon=True)
+         self._thread.start()
+
+     def before_worker_shutdown(self):
+         """
+         Stop the thread that pushes local metrics to the metrics gauges before
+         the worker shuts down.
+         """
+         # Stop the thread that pushes local metrics to the gauges.
+         assert not self._thread_stop_event.is_set()
+         self._thread_stop_event.set()
+         # Reset the metrics to their default values.
+         for _field in fields(self._metrics):
+             self._metrics_gauges[_field.name].set(_field.default, self._worker_tag)
+
+     @contextmanager
+     def on_report(self):
+         """
+         Context manager to measure the time taken to report a checkpoint to the storage.
+         """
+         start_time_s = time_monotonic()
+         yield
+         elapsed_time_s = time_monotonic() - start_time_s
+         with self._metrics_lock:
+             self._metrics.train_report_total_blocked_time_s += elapsed_time_s
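
Both callbacks above follow the same pattern: a dataclass of counters, a context manager that accumulates elapsed time under a lock, and a daemon thread that periodically publishes a snapshot. Below is a minimal sketch of that pattern without the Ray `Gauge` dependency (not part of the uploaded file; the interval is shortened for illustration):

import threading
import time
from contextlib import contextmanager
from dataclasses import asdict, dataclass


@dataclass
class Metrics:
    report_total_blocked_time_s: float = 0.0


metrics = Metrics()
metrics_lock = threading.Lock()
stop_event = threading.Event()


@contextmanager
def on_report():
    # Accumulate the time spent inside the block, like the callbacks above.
    start = time.monotonic()
    yield
    with metrics_lock:
        metrics.report_total_blocked_time_s += time.monotonic() - start


def push_local_metrics(interval_s: float = 0.5):
    # A real implementation would set Prometheus gauges instead of printing.
    while not stop_event.is_set():
        with metrics_lock:
            snapshot = asdict(metrics)
        print(snapshot)
        time.sleep(interval_s)


threading.Thread(target=push_local_metrics, daemon=True).start()
with on_report():
    time.sleep(1.0)  # simulate a blocking checkpoint report
stop_event.set()
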
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/user_callback.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Any, Dict, List, Optional
+
+ from ray.train import Checkpoint
+ from ray.train.v2._internal.execution.callback import (
+     ReportCallback,
+     WorkerGroupCallback,
+ )
+ from ray.train.v2._internal.execution.context import TrainRunContext
+ from ray.train.v2._internal.execution.worker_group import WorkerGroupStatus
+ from ray.train.v2.api.callback import UserCallback
+
+
+ class UserCallbackHandler(WorkerGroupCallback, ReportCallback):
+     """Responsible for calling methods of subscribers implementing
+     the `UserCallback` interface.
+     """
+
+     def __init__(
+         self, user_callbacks: List[UserCallback], train_run_context: TrainRunContext
+     ):
+         self._user_callbacks = user_callbacks
+         self._train_run_context = train_run_context
+
+     # --------------------------
+     # ReportCallback
+     # --------------------------
+
+     def after_report(
+         self, metrics: List[Dict[str, Any]], checkpoint: Optional[Checkpoint]
+     ):
+         for user_callback in self._user_callbacks:
+             user_callback.after_report(
+                 run_context=self._train_run_context,
+                 metrics=metrics,
+                 checkpoint=checkpoint,
+             )
+
+     # --------------------------
+     # WorkerGroupCallback
+     # --------------------------
+
+     def after_worker_group_poll_status(self, worker_group_status: WorkerGroupStatus):
+         if not worker_group_status.errors:
+             return
+
+         for user_callback in self._user_callbacks:
+             user_callback.after_exception(
+                 run_context=self._train_run_context,
+                 worker_exceptions=worker_group_status.errors,
+             )
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/callbacks/working_dir_setup.py ADDED
@@ -0,0 +1,24 @@
+ import logging
+ import os
+
+ from ray.train.v2._internal.execution.callback import WorkerGroupCallback
+ from ray.train.v2._internal.execution.context import get_train_context
+ from ray.train.v2._internal.execution.worker_group import WorkerGroup
+
+ logger = logging.getLogger(__name__)
+
+
+ class WorkingDirectorySetupCallback(WorkerGroupCallback):
+     def after_worker_group_start(self, worker_group: WorkerGroup):
+         def chdir_to_working_dir() -> None:
+             """Create the local working directory for the experiment."""
+             local_working_directory = (
+                 get_train_context().get_storage().local_working_directory
+             )
+             os.makedirs(local_working_directory, exist_ok=True)
+             logger.debug(
+                 f"Changing the working directory to: {local_working_directory}"
+             )
+             os.chdir(local_working_directory)
+
+         worker_group.execute(chdir_to_working_dir)
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/constants.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ from typing import Dict
+
+ from ray._private.ray_constants import env_bool, env_set_by_user
+
+ # Unsupported configs can use this value to detect if the user has set it.
+ _UNSUPPORTED = "UNSUPPORTED"
+ _DEPRECATED = "DEPRECATED"
+
+ # The name of the file that is used to validate the storage.
+ VALIDATE_STORAGE_MARKER_FILENAME = ".validate_storage_marker"
+ # The name of the file that is used to store the checkpoint manager snapshot.
+ CHECKPOINT_MANAGER_SNAPSHOT_FILENAME = "checkpoint_manager_snapshot.json"
+
+
+ # =====================
+ # Environment Variables
+ # =====================
+
+ # Polling interval for the Train controller.
+ # This determines how many seconds the controller will wait between
+ # polling the worker group for its status.
+ HEALTH_CHECK_INTERVAL_S_ENV_VAR = "RAY_TRAIN_HEALTH_CHECK_INTERVAL_S"
+ DEFAULT_HEALTH_CHECK_INTERVAL_S: float = 2.0
+
+ # The time in seconds a worker health check must be hanging for
+ # before the controller marks the worker as dead and handles the failure.
+ WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VAR = "RAY_TRAIN_WORKER_HEALTH_CHECK_TIMEOUT_S"
+ DEFAULT_WORKER_HEALTH_CHECK_TIMEOUT_S: float = 10 * 60
+
+ # Timeout in seconds for the worker group to start.
+ WORKER_GROUP_START_TIMEOUT_S_ENV_VAR = "RAY_TRAIN_WORKER_GROUP_START_TIMEOUT_S"
+ DEFAULT_WORKER_GROUP_START_TIMEOUT_S: float = 30.0
+
+ # Timeout in seconds for `ray.train.report` to block on synchronization barriers,
+ # after which a timeout error will be raised.
+ REPORT_BARRIER_TIMEOUT_S_ENV_VAR = "RAY_TRAIN_REPORT_BARRIER_TIMEOUT_S"
+ DEFAULT_REPORT_BARRIER_TIMEOUT_S: float = 60 * 30
+ # Time in seconds after which `ray.train.report` logs a warning while it is still
+ # waiting for the sync actor's release notification.
+ REPORT_BARRIER_WARN_INTERVAL_S_ENV_VAR = "RAY_TRAIN_REPORT_BARRIER_WARN_INTERVAL_S"
+ DEFAULT_REPORT_BARRIER_WARN_INTERVAL_S: float = 60
+
+ # The environment variable to enable the Ray Train metrics.
+ METRICS_ENABLED_ENV_VAR = "RAY_TRAIN_METRICS_ENABLED"
+
+ # Environment variable to enable the print function patching.
+ ENABLE_PRINT_PATCH_ENV_VAR = "RAY_TRAIN_ENABLE_PRINT_PATCH"
+ DEFAULT_ENABLE_PRINT_PATCH = "1"
+
+ # Whether or not to run the controller as an actor.
+ RUN_CONTROLLER_AS_ACTOR_ENV_VAR = "RAY_TRAIN_RUN_CONTROLLER_AS_ACTOR"
+ DEFAULT_RUN_CONTROLLER_AS_ACTOR = "1"
+
+ # V2 feature flag.
+ V2_ENABLED_ENV_VAR = "RAY_TRAIN_V2_ENABLED"
+
+
+ def is_v2_enabled() -> bool:
+     return env_bool(V2_ENABLED_ENV_VAR, False)
+
+
+ ENV_VARS_TO_PROPAGATE = {
+     V2_ENABLED_ENV_VAR,
+     HEALTH_CHECK_INTERVAL_S_ENV_VAR,
+     WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VAR,
+     WORKER_GROUP_START_TIMEOUT_S_ENV_VAR,
+     ENABLE_PRINT_PATCH_ENV_VAR,
+ }
+
+
+ def get_env_vars_to_propagate() -> Dict[str, str]:
+     """Returns a dictionary of environment variables that should be propagated
+     from the driver to the controller, and then from the controller
+     to each training worker.
+
+     This way, users only need to set environment variables in one place
+     when launching the script instead of needing to manually set a runtime environment.
+     """
+     env_vars = {}
+     for env_var in ENV_VARS_TO_PROPAGATE:
+         if env_set_by_user(env_var):
+             env_vars[env_var] = os.environ[env_var]
+     return env_vars
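
A sketch of how `get_env_vars_to_propagate` is intended to be used, assuming a Ray build containing this `ray.train.v2` module is installed; only variables explicitly set by the user are forwarded, for example into a `runtime_env` (not part of the uploaded file):

import os

# Set by the user before launching the training script.
os.environ["RAY_TRAIN_V2_ENABLED"] = "1"
os.environ["RAY_TRAIN_HEALTH_CHECK_INTERVAL_S"] = "5"

from ray.train.v2._internal.constants import (
    get_env_vars_to_propagate,
    is_v2_enabled,
)

assert is_v2_enabled()
# Forward only the user-set variables, e.g. via the runtime_env of the
# controller actor and the training workers.
runtime_env = {"env_vars": get_env_vars_to_propagate()}
print(runtime_env)
# e.g. {'env_vars': {'RAY_TRAIN_V2_ENABLED': '1',
#                    'RAY_TRAIN_HEALTH_CHECK_INTERVAL_S': '5'}}
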
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/exceptions.py ADDED
@@ -0,0 +1,170 @@
+ import os
+ from typing import Dict, List, Optional
+
+ from ray.train.v2._internal.constants import (
+     DEFAULT_WORKER_GROUP_START_TIMEOUT_S,
+     DEFAULT_WORKER_HEALTH_CHECK_TIMEOUT_S,
+     REPORT_BARRIER_TIMEOUT_S_ENV_VAR,
+     WORKER_GROUP_START_TIMEOUT_S_ENV_VAR,
+     WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VAR,
+ )
+
+
+ # TODO: Distinguish between user and system exceptions.
+ class RayTrainError(Exception):
+     """Base class for all Ray Train exceptions."""
+
+
+ class WorkerHealthCheckTimeoutError(RayTrainError):
+     """Exception raised when a worker health check hangs for long enough."""
+
+     def __init__(self, message):
+         timeout = os.getenv(
+             WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VAR, DEFAULT_WORKER_HEALTH_CHECK_TIMEOUT_S
+         )
+         message += (
+             f"\nSet the {WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VAR} "
+             "environment variable to increase the timeout "
+             f"(current value: {timeout} seconds)."
+         )
+         super().__init__(message)
+
+
+ class WorkerHealthCheckFailedError(RayTrainError):
+     """Exception raised when a worker health check fails."""
+
+     def __init__(self, message, failure: Exception):
+         super().__init__(message)
+         self._message = message
+         self.health_check_failure = failure
+
+     def __reduce__(self):
+         return (self.__class__, (self._message, self.health_check_failure))
+
+
+ class TrainingFailedError(RayTrainError):
+     """Exception raised when training fails."""
+
+     def __init__(self, worker_failures: Dict[int, Exception]):
+         super().__init__(
+             "Training failed due to worker errors. "
+             "Please inspect the error logs above, "
+             "or access the latest worker failures in this "
+             "exception's `worker_failures` attribute."
+         )
+         self.worker_failures = worker_failures
+
+     def __reduce__(self):
+         return (self.__class__, (self.worker_failures,))
+
+
+ class WorkerGroupStartupTimeoutError(RayTrainError):
+     """Exception raised when the worker group startup times out.
+
+     Example scenario: 4 GPUs are detected in the cluster, but when the workers
+     are actually scheduled, one of the nodes goes down and only 3 GPUs are
+     available. One of the worker tasks may be stuck pending, until a timeout is reached.
+     """
+
+     def __init__(self, num_workers: int):
+         timeout = float(
+             os.environ.get(
+                 WORKER_GROUP_START_TIMEOUT_S_ENV_VAR,
+                 DEFAULT_WORKER_GROUP_START_TIMEOUT_S,
+             )
+         )
+         self.num_workers = num_workers
+         super().__init__(
+             f"The worker group startup timed out after {timeout} seconds waiting "
+             f"for {num_workers} workers. "
+             "Potential causes include: "
+             "(1) temporary insufficient cluster resources while waiting for "
+             "autoscaling (ignore this warning in this case), "
+             "(2) an infeasible resource request where the provided `ScalingConfig` "
+             "cannot be satisfied, "
+             "and (3) transient network issues. "
+             f"Set the {WORKER_GROUP_START_TIMEOUT_S_ENV_VAR} "
+             "environment variable to increase the timeout."
+         )
+
+     def __reduce__(self):
+         return (self.__class__, (self.num_workers,))
+
+
+ class WorkerGroupStartupFailedError(RayTrainError):
+     """Exception raised when the worker group fails to start.
+
+     Example scenario: A worker is scheduled onto a node that dies while
+     the worker actor is initializing.
+     """
+
+
+ class CheckpointManagerInitializationError(RayTrainError):
+     """Exception raised when the checkpoint manager fails to initialize from a snapshot.
+
+     Example scenarios:
+     1. The checkpoint manager snapshot version is old and
+        incompatible with the current version of Ray Train.
+     2. The checkpoint manager snapshot JSON file is corrupted.
+     3. The checkpoint manager snapshot references checkpoints that cannot be found
+        in the run storage path.
+     """
+
+
+ class CollectiveTimeoutError(RayTrainError):
+     """Exception raised when an internal Ray Train collective operation of
+     the worker group times out.
+     """
+
+
+ class BroadcastCollectiveTimeoutError(CollectiveTimeoutError):
+     """Exception raised when the broadcast operation times out.
+
+     There are two main timeout examples:
+     1. If not all workers call `ray.train.report`, the entire worker group will
+        hang until the timeout before raising. This prevents indefinite worker
+        group hangs.
+     2. If a worker is slow in the training loop and fails to reach the broadcast
+        in time, the collective will time out.
+     """
+
+     def __init__(
+         self, time_elapsed: Optional[float], missing_ranks: List[int], timeout_s: float
+     ):
+         self._time_elapsed = time_elapsed
+         self._missing_ranks = missing_ranks
+         self._timeout_s = timeout_s
+
+         message = (
+             f"The broadcast operation timed out after {time_elapsed:.2f} seconds. "
+             "Please make sure all worker ranks call `ray.train.report`. \n"
+             f"The following ranks have not called it: {missing_ranks}\n"
+             f"You can set this timeout with the {REPORT_BARRIER_TIMEOUT_S_ENV_VAR} "
+             f"environment variable (current value: {timeout_s:.2f} s)."
+         )
+         super().__init__(message)
+
+     def __reduce__(self):
+         return (
+             self.__class__,
+             (self._time_elapsed, self._missing_ranks, self._timeout_s),
+         )
+
+
+ class UserExceptionWithTraceback(RayTrainError):
+     """This class wraps a user code exception raised on the worker
+     with its original traceback string, for logging and debugging purposes.
+
+     This is needed because the original exception traceback is not serialized
+     with the exception when it is *returned* back to the main process.
+     """
+
+     def __init__(self, exc: BaseException, traceback_str: str):
+         self._base_exc = exc
+         self._traceback_str = traceback_str
+
+     def __reduce__(self):
+         return (self.__class__, (self._base_exc, self._traceback_str))
+
+     def __str__(self):
+         return self._traceback_str
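
A sketch of why `UserExceptionWithTraceback` exists: the wrapper keeps the formatted traceback across pickling, which a bare exception would lose when returned from a worker process (not part of the uploaded file; assumes this module is importable):

import pickle
import traceback

from ray.train.v2._internal.exceptions import UserExceptionWithTraceback


def user_train_fn():
    raise ValueError("bad hyperparameter")


try:
    user_train_fn()
except Exception as exc:
    wrapped = UserExceptionWithTraceback(exc, traceback.format_exc())

# __reduce__ makes the wrapper pickle cleanly, and str() shows the original
# traceback, which a plainly pickled exception would have dropped.
restored = pickle.loads(pickle.dumps(wrapped))
print(str(restored))
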
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/callback.cpython-311.pyc ADDED
Binary file (7.91 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/context.cpython-311.pyc ADDED
Binary file (13.5 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/controller.cpython-311.pyc ADDED
Binary file (18.3 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/__pycache__/storage.cpython-311.pyc ADDED
Binary file (28.2 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/callback.py ADDED
@@ -0,0 +1,140 @@
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from ray.train.v2.api.callback import RayTrainCallback
+ from ray.util.annotations import DeveloperAPI
+
+ if TYPE_CHECKING:
+     from ray.train import Checkpoint
+     from ray.train.v2._internal.execution.controller import TrainControllerState
+     from ray.train.v2._internal.execution.failure_handling import FailureDecision
+     from ray.train.v2._internal.execution.scaling_policy import ScalingDecision
+     from ray.train.v2._internal.execution.worker_group import (
+         WorkerGroup,
+         WorkerGroupStatus,
+     )
+
+
+ @DeveloperAPI
+ class WorkerGroupCallback(RayTrainCallback):
+     def before_init_train_context(
+         self, worker_group: "WorkerGroup"
+     ) -> Dict[str, List[Any]]:
+         """Called before initializing the TrainContext for the worker_group.
+
+         Return:
+             A dictionary of additional arguments for TrainContext.
+             The key is the argument name and the value is a list of argument values
+             to pass to the TrainContext constructor of each worker in the worker group.
+         """
+         return {}
+
+     @contextmanager
+     def on_worker_group_start(self):
+         yield
+
+     def after_worker_group_start(self, worker_group: "WorkerGroup"):
+         """Called after the worker group actors are initialized.
+         All workers should be ready to execute tasks."""
+         pass
+
+     def after_worker_group_training_start(self, worker_group: "WorkerGroup"):
+         pass
+
+     @contextmanager
+     def on_worker_group_shutdown(self):
+         yield
+
+     def before_worker_group_shutdown(self, worker_group: "WorkerGroup"):
+         """Called before the worker group is shut down.
+         Workers may be dead at this point due to actor failures, so this method
+         should catch and handle exceptions if attempting to execute tasks."""
+         pass
+
+     def after_worker_group_poll_status(self, worker_group_status: "WorkerGroupStatus"):
+         pass
+
+
+ @DeveloperAPI
+ class ControllerCallback(RayTrainCallback):
+     def after_controller_start(self):
+         """Called immediately after `TrainController.run` is called,
+         before the control loop starts executing."""
+         pass
+
+     def before_controller_shutdown(self):
+         """Called before `TrainController.run` exits,
+         after the control loop has exited."""
+         pass
+
+     def after_controller_state_update(
+         self,
+         previous_state: "TrainControllerState",
+         current_state: "TrainControllerState",
+     ):
+         """Called whenever the controller state is updated."""
+         pass
+
+     def before_controller_execute_failure_decision(
+         self,
+         failure_decision: "FailureDecision",
+         worker_group_status: "WorkerGroupStatus",
+     ):
+         """Called before the controller executes a failure decision."""
+         pass
+
+     def before_controller_execute_scaling_decision(
+         self,
+         scaling_decision: "ScalingDecision",
+         worker_group_status: "WorkerGroupStatus",
+     ):
+         """Called before the controller executes a scaling decision."""
+         pass
+
+
+ @DeveloperAPI
+ class ReportCallback(RayTrainCallback):
+     def after_report(
+         self, metrics: List[Dict[str, Any]], checkpoint: Optional["Checkpoint"]
+     ):
+         """Called after all workers have reported a training result.
+
+         Note that this differs from `after_worker_group_poll_status`,
+         which may only contain a subset of workers that have reported.
+         For example, if only rank 0 is performing checkpointing, then
+         rank 0 would report a training result the slowest.
+         """
+         pass
+
+
+ @DeveloperAPI
+ class WorkerCallback(RayTrainCallback):
+     """
+     Callbacks that are hooked to the worker event.
+
+     These callbacks are created on the train driver process and then
+     copied and passed to all the workers.
+     The execution of these callbacks happens on each of the workers,
+     not on the train driver process.
+     """
+
+     def after_init_train_context(self):
+         pass
+
+     def before_worker_shutdown(self):
+         pass
+
+
+ @DeveloperAPI
+ class TrainContextCallback(RayTrainCallback):
+     """
+     Callbacks that are hooked to the train context event.
+
+     These callbacks are created on the train driver process and then
+     copied and passed to all the workers.
+     The execution of these callbacks happens on the train context of the workers.
+     """
+
+     @contextmanager
+     def on_report(self):
+         yield
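
A sketch of a custom hook built on the `DeveloperAPI` interfaces above; the subclass and its logging behavior are hypothetical, only the base class and hook names come from this file (not part of the uploaded file):

import logging
import time
from contextlib import contextmanager

from ray.train.v2._internal.execution.callback import WorkerGroupCallback

logger = logging.getLogger(__name__)


class WorkerGroupTimerCallback(WorkerGroupCallback):
    """Logs how long each worker group takes to start."""

    @contextmanager
    def on_worker_group_start(self):
        start = time.monotonic()
        yield
        logger.info("Worker group started in %.2f s", time.monotonic() - start)

    def after_worker_group_start(self, worker_group):
        logger.info("Worker group ready with %d workers", len(worker_group))
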
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (216 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/checkpoint_manager.cpython-311.pyc ADDED
Binary file (12.8 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/report_handler.cpython-311.pyc ADDED
Binary file (5.79 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/__pycache__/sync_actor.cpython-311.pyc ADDED
Binary file (10.5 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from ray.air.config import CheckpointConfig
5
+ from ray.train._checkpoint import Checkpoint
6
+ from ray.train._internal.checkpoint_manager import (
7
+ _CheckpointManager,
8
+ _insert_into_sorted_list,
9
+ )
10
+ from ray.train._internal.session import _TrainingResult
11
+ from ray.train.v2._internal.exceptions import CheckpointManagerInitializationError
12
+ from ray.train.v2._internal.execution.callback import ReportCallback
13
+ from ray.train.v2._internal.execution.context import StorageContext
14
+ from ray.train.v2._internal.execution.storage import _delete_fs_path, _exists_at_fs_path
15
+
16
+ try:
17
+ from pydantic import BaseModel
18
+ from pydantic_core import from_json
19
+ except (ImportError, ModuleNotFoundError) as exc:
20
+ raise ImportError(
21
+ "`ray.train.v2` requires the pydantic package, which is missing. "
22
+ "Run the following command to fix this: `pip install pydantic`"
23
+ ) from exc
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class _TrainingResultState(BaseModel):
30
+ # Increment version if the schema changes
31
+ version: int = 0
32
+ checkpoint_dir_name: str
33
+ metrics: dict
34
+
35
+
36
+ class _CheckpointManagerState(BaseModel):
37
+ # Increment version if the schema changes
38
+ version: int = 0
39
+ checkpoint_results: List[_TrainingResultState]
40
+ latest_checkpoint_result: Optional[_TrainingResultState]
41
+
42
+
43
+ def _get_training_result_from_state(
44
+ state: _TrainingResultState,
45
+ storage_context: StorageContext,
46
+ ) -> _TrainingResult:
47
+ """Get a TrainingResult object from a Pydantic state object."""
48
+ return _TrainingResult(
49
+ checkpoint=Checkpoint(
50
+ path=storage_context.build_checkpoint_path_from_name(
51
+ state.checkpoint_dir_name
52
+ ),
53
+ filesystem=storage_context.storage_filesystem,
54
+ ),
55
+ metrics=state.metrics,
56
+ )
57
+
58
+
59
+ def _get_state_from_training_result(
60
+ training_result: _TrainingResult,
61
+ storage_context: StorageContext,
62
+ ) -> _TrainingResultState:
63
+ """Get a Pydantic state object from a TrainingResult object."""
64
+ return _TrainingResultState(
65
+ checkpoint_dir_name=storage_context.extract_checkpoint_dir_name_from_path(
66
+ training_result.checkpoint.path
67
+ ),
68
+ metrics=training_result.metrics,
69
+ )
70
+
71
+
72
+ class CheckpointManager(_CheckpointManager, ReportCallback):
73
+ def __init__(
74
+ self,
75
+ checkpoint_config: CheckpointConfig,
76
+ storage_context: StorageContext,
77
+ ):
78
+ self._storage_context = storage_context
79
+ self._checkpoint_config = checkpoint_config
80
+ super().__init__(checkpoint_config)
81
+ # If the snapshot is found, the checkpoint manager will restore its state.
82
+ self._maybe_load_state_from_storage()
83
+
84
+ def register_checkpoint(self, checkpoint_result: _TrainingResult):
85
+ """Register new checkpoint and add to bookkeeping.
86
+
87
+ This method will register a new checkpoint and add it to the internal
88
+ bookkeeping logic. This means the checkpoint manager will decide if
89
+ this checkpoint should be kept, and if older or worse performing
90
+ checkpoints should be deleted.
91
+
92
+ Args:
93
+ checkpoint: Tracked checkpoint object to add to bookkeeping.
94
+ """
95
+ self._latest_checkpoint_result = checkpoint_result
96
+
97
+ if self._checkpoint_config.checkpoint_score_attribute is not None:
98
+ # If we're ordering by a score, insert the checkpoint
99
+ # so that the list remains sorted.
100
+ _insert_into_sorted_list(
101
+ self._checkpoint_results,
102
+ checkpoint_result,
103
+ key=self._get_checkpoint_score,
104
+ )
105
+ else:
106
+ # If no metric is provided, just append (ordering by time of registration).
107
+ self._checkpoint_results.append(checkpoint_result)
108
+
109
+ results_to_delete = {}
110
+ if self._checkpoint_config.num_to_keep is not None:
111
+ # Delete the bottom (N - K) checkpoints
112
+ worst_results = set(
113
+ self._checkpoint_results[: -self._checkpoint_config.num_to_keep]
114
+ )
115
+ # Except for the latest checkpoint.
116
+ results_to_delete = worst_results - {self._latest_checkpoint_result}
117
+
118
+ # Update internal state before actually deleting them.
119
+ self._checkpoint_results = [
120
+ checkpoint_result
121
+ for checkpoint_result in self._checkpoint_results
122
+ if checkpoint_result not in results_to_delete
123
+ ]
124
+
125
+ # Save the checkpoint manager state to storage.
126
+ # Note: We save the state before deleting the old checkpoints.
127
+ # If deletion happens first and the process crashes, our snapshot
128
+ # may point to some stale checkpoints that are already deleted.
129
+ # TODO: Make this writing operation non-blocking.
130
+ self._write_state_to_storage()
131
+
132
+ # Delete the old checkpoints.
133
+ for checkpoint_result in results_to_delete:
134
+ checkpoint = checkpoint_result.checkpoint
135
+ logger.debug("Deleting checkpoint: ", checkpoint)
136
+ _delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path)
137
+
138
+ # --------------------------
139
+ # CheckpointManager state
140
+ # --------------------------
141
+
142
+ def _save_state(self) -> str:
143
+ """Save the checkpoint manager state to a JSON str."""
144
+
145
+ checkpoint_results = [
146
+ _get_state_from_training_result(checkpoint_result, self._storage_context)
147
+ for checkpoint_result in self._checkpoint_results
148
+ ]
149
+
150
+ latest_checkpoint_result = (
151
+ _get_state_from_training_result(
152
+ self._latest_checkpoint_result, self._storage_context
153
+ )
154
+ if self._latest_checkpoint_result is not None
155
+ else None
156
+ )
157
+
158
+ manager_snapshot = _CheckpointManagerState(
159
+ checkpoint_results=checkpoint_results,
160
+ latest_checkpoint_result=latest_checkpoint_result,
161
+ )
162
+ return manager_snapshot.model_dump_json()
163
+
164
+ def _load_state(self, json_state: str):
165
+ """Load the checkpoint manager state from a JSON str."""
166
+ try:
167
+ manager_snapshot = _CheckpointManagerState.model_validate(
168
+ from_json(json_state)
169
+ )
170
+ except Exception as e:
171
+ raise CheckpointManagerInitializationError(repr(e)) from e
172
+ self._assert_checkpoints_exist()
173
+
174
+ self._checkpoint_results = [
175
+ _get_training_result_from_state(
176
+ training_result_state, self._storage_context
177
+ )
178
+ for training_result_state in manager_snapshot.checkpoint_results
179
+ ]
180
+
181
+ self._latest_checkpoint_result = (
182
+ _get_training_result_from_state(
183
+ manager_snapshot.latest_checkpoint_result, self._storage_context
184
+ )
185
+ if manager_snapshot.latest_checkpoint_result is not None
186
+ else None
187
+ )
188
+
189
+ def _maybe_load_state_from_storage(self):
190
+ """Load the checkpoint manager state from storage.
191
+ If no snapshot is found, start with a clean state.
192
+ """
193
+ if not _exists_at_fs_path(
194
+ fs=self._storage_context.storage_filesystem,
195
+ fs_path=self._storage_context.checkpoint_manager_snapshot_path,
196
+ ):
197
+ logger.debug(
198
+ "No checkpoint manager snapshot found. "
199
+ "No checkpoint will be available via `ray.train.get_checkpoint`, "
200
+ "so training will start from scratch."
201
+ )
202
+ return
203
+ with self._storage_context.storage_filesystem.open_input_stream(
204
+ self._storage_context.checkpoint_manager_snapshot_path
205
+ ) as f:
206
+ logger.info(
207
+ "A run snapshot was found in storage folder at: "
208
+ f"'{self._storage_context.experiment_fs_path}'\n"
209
+ "This snapshot contains a list of checkpoints reported via "
210
+ "`ray.train.report` and will be loaded. "
211
+ "This allows the latest checkpoint found in the snapshot to be "
212
+ "accessible within your training function via "
213
+ "`ray.train.get_checkpoint`.\n"
214
+ "If you meant to start a brand new training job without any "
215
+ "information about previous checkpoints found in this directory, "
216
+ "please configure a new, unique `RunConfig(name)` or delete the "
217
+ f"existing folder at '{self._storage_context.experiment_fs_path}'."
218
+ )
219
+ json_state = f.read().decode("utf-8")
220
+ self._load_state(json_state)
221
+
222
+ def _write_state_to_storage(self):
223
+ """Write the checkpoint manager state to storage."""
224
+ checkpoint_manager_snapshot = self._save_state()
225
+ with self._storage_context.storage_filesystem.open_output_stream(
226
+ self._storage_context.checkpoint_manager_snapshot_path
227
+ ) as f:
228
+ f.write(checkpoint_manager_snapshot.encode("utf-8"))
229
+
230
+ def _assert_checkpoints_exist(self):
231
+ """Validate the checkpoint manager state.
232
+
233
+ This method validates the checkpoint manager state by checking that
234
+ the checkpoints referenced in the manager snapshot are consistent with the
235
+ checkpoint folders on the experiment storage filesystem.
236
+
237
+ Raises:
238
+ CheckpointManagerInitializationError: If the checkpoint manager snapshot
239
+ is not consistent with the stored checkpoints.
240
+ """
241
+ for checkpoint_result in self._checkpoint_results:
242
+ checkpoint = checkpoint_result.checkpoint
243
+ assert checkpoint is not None
244
+ if not _exists_at_fs_path(
245
+ fs=checkpoint.filesystem, fs_path=checkpoint.path
246
+ ):
247
+ raise CheckpointManagerInitializationError(
248
+ message=(
249
+ "The run snapshot contains a reference to a checkpoint "
250
+ f"that does not exist anymore ({checkpoint}). You are "
251
+ "running in a corrupted run directory `experiment_fs_path`. "
252
+ "Please configure a new, unique `RunConfig(name)` "
253
+ "or delete the existing folder at "
254
+ f"`{self._storage_context.experiment_fs_path}`."
255
+ )
256
+ )
257
+
258
+ # --------------------------
259
+ # ReportCallback
260
+ # --------------------------
261
+
262
+ def after_report(
263
+ self, metrics: List[Dict[str, Any]], checkpoint: Optional[Checkpoint]
264
+ ):
265
+ if not checkpoint:
266
+ return
267
+
268
+ rank_0_metrics = metrics[0]
269
+ self.register_checkpoint(
270
+ _TrainingResult(checkpoint=checkpoint, metrics=rank_0_metrics)
271
+ )
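The `after_report` hook above is the only integration point a report subscriber needs: the handler (see `report_handler.py` below) passes one metrics dict per worker plus the consolidated checkpoint. As a hedged sketch that is not part of this diff, a minimal additional `ReportCallback` could look like the following; the `PrintingReportCallback` name is hypothetical, only the hook signature comes from the code above.

from typing import Any, Dict, List, Optional

from ray.train import Checkpoint
from ray.train.v2._internal.execution.callback import ReportCallback


class PrintingReportCallback(ReportCallback):
    # Hypothetical subscriber that prints the rank-0 metrics of every report.
    def after_report(
        self, metrics: List[Dict[str, Any]], checkpoint: Optional[Checkpoint]
    ):
        # `metrics` holds one dict per worker; index 0 corresponds to rank 0.
        print(f"rank-0 metrics: {metrics[0]}, checkpoint: {checkpoint}")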
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/report_handler.py ADDED
@@ -0,0 +1,111 @@
1
+ from collections import deque
2
+ from typing import TYPE_CHECKING, Deque, List, Optional
3
+
4
+ from ray.train.v2._internal.execution.callback import (
5
+ ReportCallback,
6
+ WorkerGroupCallback,
7
+ )
8
+ from ray.train.v2._internal.execution.worker_group import WorkerGroup, WorkerGroupStatus
9
+
10
+ if TYPE_CHECKING:
11
+ from ray.train._internal.session import _TrainingResult
12
+
13
+
14
+ class ReportCallbackHandler(WorkerGroupCallback):
15
+ """Consolidate training results from multiple workers and call
16
+ subscribers implementing the `ReportCallback` interface sequentially.
17
+ """
18
+
19
+ def __init__(self, report_callbacks: List[ReportCallback]):
20
+ # Number of workers in the current worker group. It is initialized
21
+ # to be None. It is set to the number of workers when it receives the
22
+ # worker group status for the first time.
23
+ # When a worker group shuts down, self._num_workers is set to None,
24
+ # waiting to be updated when a new worker group status is received again.
25
+ self._num_workers: Optional[int] = None
26
+ # A list of queues holding training results from workers.
27
+ self._training_result_queues: Optional[List[Deque[_TrainingResult]]] = None
28
+
29
+ self._report_callbacks = report_callbacks
30
+
31
+ # --------------------------
32
+ # WorkerGroupCallback
33
+ # --------------------------
34
+
35
+ def after_worker_group_poll_status(
36
+ self, worker_group_status: WorkerGroupStatus
37
+ ) -> None:
38
+ """Handle training results as they roll in from worker status polls.
39
+
40
+ Wait for all workers to report training results to collect
41
+ a consolidated training result.
42
+ """
43
+ # Step 1: Ensure that the internal state (the number of workers and the
44
+ # per-worker training result queues) has already been initialized by
45
+ # `after_worker_group_start` before handling polled worker group statuses.
46
+ assert (
47
+ self._num_workers and self._training_result_queues
48
+ ), "Need to call initialize state with `after_worker_group_start` first."
49
+
50
+ assert self._num_workers == worker_group_status.num_workers, (
51
+ f"The number of workers in the worker group has changed unexpectedly. "
52
+ f"Expected: {self._num_workers}, got: {worker_group_status.num_workers}"
53
+ )
54
+
55
+ # Step 2: Update training_results_queues with poll_results.
56
+ for i in range(self._num_workers):
57
+ training_result = worker_group_status.worker_statuses[i].training_result
58
+ if training_result:
59
+ self._training_result_queues[i].append(training_result)
60
+
61
+ # Directly return if any of the worker result queues are empty.
62
+ if not all(self._training_result_queues):
63
+ return
64
+
65
+ training_results = [q.popleft() for q in self._training_result_queues]
66
+
67
+ # Step 3: Consolidate a list of checkpoints to single checkpoint.
68
+ # Use the first checkpoint as the consolidated checkpoint.
69
+ checkpoint_results = [
70
+ tr for tr in training_results if tr.checkpoint is not None
71
+ ]
72
+
73
+ consolidated_checkpoint = None
74
+ if checkpoint_results:
75
+ # Double check the storage path of the checkpoints in the training results.
76
+ unique_checkpoint_paths = {tr.checkpoint.path for tr in checkpoint_results}
77
+ if len(unique_checkpoint_paths) > 1:
78
+ # TODO: Support for inconsistent checkpoints path from workers
79
+ # instead of hard raising error. Maybe drop this iteration of
80
+ # training results and continue with the next iteration.
81
+ raise RuntimeError(
82
+ "The storage path of the checkpoints in the training results "
83
+ "is not the same. This means the checkpoints are not consistent. "
84
+ "Got a mix of the following checkpoint paths: "
85
+ f"{unique_checkpoint_paths}\n"
86
+ "This is unexpected -- please file a Github issue."
87
+ )
88
+ consolidated_checkpoint = checkpoint_results[0].checkpoint
89
+
90
+ # Step 4: Invoke all dependent `ReportCallback`s.
91
+ metrics_per_worker = [
92
+ training_result.metrics for training_result in training_results
93
+ ]
94
+ for callback in self._report_callbacks:
95
+ callback.after_report(
96
+ metrics=metrics_per_worker,
97
+ checkpoint=consolidated_checkpoint,
98
+ )
99
+
100
+ def after_worker_group_start(self, worker_group: WorkerGroup) -> None:
101
+ """Handle worker group start. Initialize internal states."""
102
+ self._num_workers = len(worker_group)
103
+ self._training_result_queues = [deque() for _ in range(self._num_workers)]
104
+
105
+ def before_worker_group_shutdown(self, worker_group: WorkerGroup) -> None:
106
+ """Handle worker group shutdown. Clear internal states.
107
+
108
+ None of the partial reported results are valid at this point.
109
+ """
110
+ self._num_workers = None
111
+ self._training_result_queues = None
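To make the consolidation rule above concrete, here is a small self-contained sketch (plain Python, not Ray code) of the per-rank buffering in `after_worker_group_poll_status`: results are queued per worker and only released as one consolidated batch once every worker has reported at least once.

from collections import deque

num_workers = 3
queues = [deque() for _ in range(num_workers)]


def on_poll(per_worker_results):
    # One optional result per worker for this poll, mirroring worker_statuses.
    for i, result in enumerate(per_worker_results):
        if result is not None:
            queues[i].append(result)
    # Wait until every worker has at least one buffered result.
    if not all(queues):
        return None
    return [q.popleft() for q in queues]


print(on_poll(["w0-step1", None, "w2-step1"]))  # None: rank 1 has not reported yet
print(on_poll([None, "w1-step1", None]))        # ['w0-step1', 'w1-step1', 'w2-step1']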
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/checkpoint/sync_actor.py ADDED
@@ -0,0 +1,190 @@
1
+ import asyncio
2
+ import logging
3
+ from contextlib import contextmanager
4
+ from typing import List, Optional, TypeVar
5
+
6
+ import ray
7
+ from ray.train.v2._internal.constants import (
8
+ DEFAULT_REPORT_BARRIER_TIMEOUT_S,
9
+ DEFAULT_REPORT_BARRIER_WARN_INTERVAL_S,
10
+ REPORT_BARRIER_WARN_INTERVAL_S_ENV_VAR,
11
+ )
12
+ from ray.train.v2._internal.exceptions import BroadcastCollectiveTimeoutError
13
+
14
+ T = TypeVar("T", bound=Optional[object])
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ BROADCAST_PERIODIC_WARNING = """
19
+ `ray.train.report` has not been called by all {world_size} workers in the group.
20
+
21
+ The workers have been waiting for {max_time_elapsed_s:.2f} s for the following ranks
22
+ to join the `report` call: {missing_ranks}.
23
+
24
+ Please ensure that all workers call `ray.train.report` regardless of whether
25
+ they participate in checkpointing or not (e.g., pass `checkpoint=None` for ranks
26
+ that do not save a checkpoint). Also ensure that workers are not hanging on
27
+ other operations, causing them to miss this synchronization barrier.
28
+
29
+ You can set the {warn_interval_env_var} environment variable to change the frequency
30
+ of this warning (current value: {warn_interval_s} s).
31
+ """
32
+
33
+
34
+ @ray.remote(num_cpus=0) # type: ignore
35
+ class SynchronizationActor:
36
+ """A Ray actor that synchronizes the workers in a distributed training job.
37
+
38
+ This actor forms a synchronization barrier on a group of processes.
39
+ Every time a worker calls the broadcast_from_rank_zero method,
40
+ the counter is incremented. When the counter equals the world size,
41
+ the actor notifies all the workers to continue.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ timeout_s: float = DEFAULT_REPORT_BARRIER_TIMEOUT_S,
47
+ warn_interval_s: float = DEFAULT_REPORT_BARRIER_WARN_INTERVAL_S,
48
+ ):
49
+ self._counter: int = 0
50
+ self._world_size: int = 0
51
+ self._condition = asyncio.Condition()
52
+ self._reduced_data = None
53
+ # The time when workers from different ranks
54
+ # enters the synchronization barrier.
55
+ self._sync_start_times: List[Optional[float]] = []
56
+ # The timeout in seconds for the synchronization barrier.
57
+ self._timeout_s: float = timeout_s
58
+ # The interval in seconds to log a warning when waiting for the barrier.
59
+ self._warn_interval_s: float = warn_interval_s
60
+
61
+ def get_counter(self):
62
+ """Returns the current value of the counter."""
63
+ return self._counter
64
+
65
+ def get_world_size(self):
66
+ """Returns the current value of the world_size."""
67
+ return self._world_size
68
+
69
+ def get_reduced_data(self):
70
+ """Returns the current value of the reduced_data."""
71
+ return self._reduced_data
72
+
73
+ def _clear_states(self):
74
+ """Decrements the counter. When the last worker has called this
75
+ method (i.e. the counter reaches zero), the actor clears its state.
76
+ """
77
+ self._counter -= 1
78
+ if self._counter == 0:
79
+ self._reduced_data = None
80
+ self._world_size = 0
81
+
82
+ def _setup_or_validate_collective_op(self, world_size: int):
83
+ """Sets up the synchronization actor if it has not been set up yet.
84
+ It initializes the world size and the start times for the
85
+ synchronization barrier.
86
+ """
87
+ if self._world_size == 0:
88
+ self._world_size = world_size
89
+ self._sync_start_times = [None] * world_size
90
+ elif world_size != self._world_size:
91
+ raise ValueError(
92
+ f"Expected all callers to provide the same world size. "
93
+ f"Got {world_size} and expected {self._world_size}."
94
+ )
95
+
96
+ @contextmanager
97
+ def _broadcast_collective_context_manager(
98
+ self, world_rank: int, world_size: int, data: T
99
+ ):
100
+ """A context manager that ensures the synchronization barrier is lifted
101
+ after the block of code is executed.
102
+ """
103
+ try:
104
+ self._setup_or_validate_collective_op(world_size)
105
+ if world_rank == 0:
106
+ self._reduced_data = data
107
+ if self._counter < self._world_size:
108
+ self._counter += 1
109
+ yield
110
+ finally:
111
+ self._clear_states()
112
+
113
+ def _get_time_elapsed(self) -> Optional[float]:
114
+ """Return the time elapsed since the first worker entered the barrier.
115
+ If no workers have entered the barrier, returns None.
116
+ """
117
+ start_times = [t for t in self._sync_start_times if t is not None]
118
+ if not start_times:
119
+ return None
120
+
121
+ return asyncio.get_event_loop().time() - min(start_times)
122
+
123
+ def _get_missing_ranks(self) -> List[int]:
124
+ """Returns the ranks that have not entered the synchronization barrier."""
125
+ return [i for i, t in enumerate(self._sync_start_times) if t is None]
126
+
127
+ async def _wait_with_logging(self, condition, world_rank: int):
128
+ """Waits for the condition to be notified, logging a warning every
129
+ `warn_interval_s` seconds while waiting on the synchronization barrier.
130
+ """
131
+ current_time = asyncio.get_event_loop().time()
132
+ self._sync_start_times[world_rank] = current_time
133
+ while True:
134
+ try:
135
+ await asyncio.wait_for(condition.wait(), timeout=self._warn_interval_s)
136
+ return
137
+ # asyncio.wait_for() raises `asyncio.TimeoutError` for asyncio<=3.10
138
+ # and raises `TimeoutError` for asyncio>=3.11
139
+ # https://docs.python.org/3/library/asyncio-task.html#asyncio.wait_for
140
+ # TODO: (hpguo) Make only one worker log the warning message.
141
+ except (asyncio.TimeoutError, TimeoutError):
142
+ logger.warning(
143
+ BROADCAST_PERIODIC_WARNING.format(
144
+ world_size=self._world_size,
145
+ max_time_elapsed_s=self._get_time_elapsed(),
146
+ missing_ranks=self._get_missing_ranks(),
147
+ warn_interval_env_var=REPORT_BARRIER_WARN_INTERVAL_S_ENV_VAR,
148
+ warn_interval_s=self._warn_interval_s,
149
+ )
150
+ )
151
+
152
+ async def broadcast_from_rank_zero(
153
+ self, world_rank: int, world_size: int, data: T
154
+ ) -> T:
155
+ """Broadcasts data from the worker with rank 0 to all other workers.
156
+
157
+ This method is a coroutine that blocks until all workers have called this
158
+ method with their data. The data from the worker with rank 0 will
159
+ be returned.
160
+ """
161
+ # Ensures that all global states manipulation is done within the async context
162
+ # manager which makes the condition variable awaiting and the counter
163
+ # incrementing an atomic operation.
164
+ async with self._condition:
165
+ with self._broadcast_collective_context_manager(
166
+ world_rank, world_size, data
167
+ ):
168
+ # If the counter is equal to the world size, it means the last worker
169
+ # has called the broadcast_from_rank_zero method. The actor notifies
170
+ # all the workers to continue.
171
+ if self._counter == self._world_size:
172
+ self._condition.notify_all()
173
+ return self._reduced_data
174
+ # If the counter is less than the world size, the actor waits for the
175
+ # other workers to call the broadcast_from_rank_zero method.
176
+ try:
177
+ await asyncio.wait_for(
178
+ self._wait_with_logging(self._condition, world_rank),
179
+ timeout=self._timeout_s,
180
+ )
181
+ return self._reduced_data
182
+ except (asyncio.TimeoutError, TimeoutError) as e:
183
+ raise BroadcastCollectiveTimeoutError(
184
+ time_elapsed=self._get_time_elapsed(),
185
+ missing_ranks=self._get_missing_ranks(),
186
+ timeout_s=self._timeout_s,
187
+ ) from e
188
+
189
+ # TODO: Implement a general consensus_from_votes method that takes a callable
190
+ # reduce_fn and a list of votes from each worker. The method returns the consensus
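For reference, the caller side of this actor mirrors what `TrainContext._sync_checkpoint_dir_name_across_ranks` does in `context.py` below: every rank calls `broadcast_from_rank_zero.remote(...)` with its rank, the world size, and its own value, and all ranks get back rank 0's value. A hedged sketch, assuming a local Ray cluster with at least two CPUs:

import ray

from ray.train.v2._internal.execution.checkpoint.sync_actor import SynchronizationActor

ray.init()

# One shared actor per worker group; num_cpus=0 so it reserves no resources.
sync_actor = SynchronizationActor.remote()


@ray.remote
def worker(rank: int, world_size: int):
    # Each rank passes its own value, but only rank 0's value is broadcast back.
    return ray.get(
        sync_actor.broadcast_from_rank_zero.remote(
            world_rank=rank, world_size=world_size, data=f"value-from-rank-{rank}"
        )
    )


print(ray.get([worker.remote(rank, 2) for rank in range(2)]))
# Expected: ['value-from-rank-0', 'value-from-rank-0']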
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/context.py ADDED
@@ -0,0 +1,281 @@
1
+ import logging
2
+ import threading
3
+ from dataclasses import dataclass
4
+ from queue import Queue
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
6
+
7
+ import ray
8
+ from ray.data.iterator import DataIterator
9
+ from ray.train import Checkpoint
10
+ from ray.train._internal import session
11
+ from ray.train._internal.session import _TrainingResult
12
+ from ray.train.v2._internal.execution.checkpoint.sync_actor import SynchronizationActor
13
+ from ray.train.v2._internal.execution.storage import StorageContext
14
+ from ray.train.v2._internal.util import _copy_doc, invoke_context_managers
15
+ from ray.train.v2.api.config import RunConfig
16
+
17
+ if TYPE_CHECKING:
18
+ from ray.train.v2._internal.execution.callback import TrainContextCallback
19
+ from ray.train.v2._internal.execution.worker_group.thread_runner import ThreadRunner
20
+
21
+
22
+ logger = logging.getLogger(__file__)
23
+
24
+
25
+ @dataclass
26
+ class TrainRunContext:
27
+ """Holds the metadata and context for the current training run."""
28
+
29
+ # TODO: Make this dataclass immutable after refactoring the train context.
30
+
31
+ # The run configuration for the current training run.
32
+ run_config: RunConfig
33
+
34
+ # TODO: Add more fields that are shared across all workers and controllers.
35
+ # For example, StorageContext, ScalingConfig, etc.
36
+
37
+ def get_run_config(self) -> RunConfig:
38
+ """Returns the run config of the current training run."""
39
+ return self.run_config
40
+
41
+
42
+ @dataclass(frozen=True)
43
+ class DistributedContext:
44
+ world_rank: int
45
+ world_size: int
46
+ local_rank: int
47
+ local_world_size: int
48
+ node_rank: int
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class ExecutionContext:
53
+ """Holds the execution context for the current worker process.
54
+
55
+ Every worker process has a single execution context accessed via the
56
+ `TrainContext`, which includes the training thread that is actually
57
+ running the user code.
58
+ """
59
+
60
+ # A shared synchronization actor that helps broadcast data across ranks.
61
+ synchronization_actor: SynchronizationActor
62
+
63
+ # A queue that receives training results from the user training code.
64
+ # `ray.train.report` in user code populates this queue.
65
+ result_queue: Queue
66
+
67
+ # The thread launcher that runs the user training loop.
68
+ training_thread_runner: "ThreadRunner"
69
+
70
+ # The callbacks that are run in the worker train context.
71
+ train_context_callbacks: List["TrainContextCallback"]
72
+
73
+
74
+ @dataclass
75
+ class TrainContext(TrainRunContext):
76
+ distributed_context: DistributedContext
77
+ execution_context: ExecutionContext
78
+ storage_context: StorageContext
79
+ dataset_shards: Dict[str, DataIterator]
80
+ checkpoint: Optional[Checkpoint] = None
81
+
82
+ @_copy_doc(session.get_metadata)
83
+ def get_metadata(self) -> Dict[str, Any]:
84
+ raise NotImplementedError
85
+
86
+ @_copy_doc(session.get_experiment_name)
87
+ def get_experiment_name(self) -> str:
88
+ # TODO: Resolve run_config.name if it is None
89
+ return self.run_config.name
90
+
91
+ @_copy_doc(session.get_trial_name)
92
+ def get_trial_name(self) -> str:
93
+ raise NotImplementedError
94
+
95
+ @_copy_doc(session.get_trial_id)
96
+ def get_trial_id(self) -> str:
97
+ raise NotImplementedError
98
+
99
+ @_copy_doc(session.get_trial_resources)
100
+ def get_trial_resources(self):
101
+ raise NotImplementedError
102
+
103
+ @_copy_doc(session.get_trial_dir)
104
+ def get_trial_dir(self) -> str:
105
+ raise NotImplementedError
106
+
107
+ @_copy_doc(session.get_world_size)
108
+ def get_world_size(self) -> int:
109
+ return self.distributed_context.world_size
110
+
111
+ @_copy_doc(session.get_world_rank)
112
+ def get_world_rank(self) -> int:
113
+ return self.distributed_context.world_rank
114
+
115
+ @_copy_doc(session.get_local_rank)
116
+ def get_local_rank(self) -> int:
117
+ return self.distributed_context.local_rank
118
+
119
+ @_copy_doc(session.get_local_world_size)
120
+ def get_local_world_size(self) -> int:
121
+ return self.distributed_context.local_world_size
122
+
123
+ @_copy_doc(session.get_node_rank)
124
+ def get_node_rank(self) -> int:
125
+ return self.distributed_context.node_rank
126
+
127
+ @_copy_doc(session.get_storage)
128
+ def get_storage(self):
129
+ return self.storage_context
130
+
131
+ def get_result_queue(self):
132
+ return self.execution_context.result_queue
133
+
134
+ def get_synchronization_actor(self):
135
+ return self.execution_context.synchronization_actor
136
+
137
+ def get_checkpoint(self):
138
+ return self.checkpoint
139
+
140
+ def get_dataset_shard(self, dataset_name: str) -> DataIterator:
141
+ """Returns the :class:`ray.data.DataIterator` shard for this worker.
142
+
143
+ Call :meth:`~ray.data.DataIterator.iter_torch_batches` or
144
+ :meth:`~ray.data.DataIterator.to_tf` on this shard to convert it to the
145
+ appropriate framework-specific data type.
146
+
147
+ Args:
148
+ dataset_name: Name of the dataset shard.
149
+ Returns:
150
+ The ``DataIterator`` shard with the given name for this worker.
151
+ Raises:
152
+ KeyError: If the dataset shard with the given name is not found.
153
+ """
154
+ try:
155
+ return self.dataset_shards[dataset_name]
156
+ except KeyError:
157
+ raise KeyError(
158
+ f"Dataset {dataset_name} not found. Available datasets: "
159
+ f"{list(self.dataset_shards.keys())}."
160
+ )
161
+
162
+ def get_context_callbacks(self) -> List["TrainContextCallback"]:
163
+ return self.execution_context.train_context_callbacks
164
+
165
+ def _sync_checkpoint_dir_name_across_ranks(
166
+ self, checkpoint_dir_name: Optional[str] = None
167
+ ) -> str:
168
+ """Sync the checkpoint dir name across ranks.
169
+
170
+ Args:
171
+ checkpoint_dir_name: The checkpoint dir name to sync.
172
+
173
+ Returns:
174
+ The synced checkpoint dir name.
175
+ """
176
+ # If checkpoint_dir_name is not set, use default checkpoint_dir_name
177
+ # created by the storage context.
178
+ checkpoint_dir_name = (
179
+ checkpoint_dir_name
180
+ or self.storage_context.make_default_checkpoint_dir_name()
181
+ )
182
+ # Get a consensus across ranks on the remote storage path, so distributed
183
+ # checkpoints will be stored to the same place.
184
+ sync_actor = self.get_synchronization_actor()
185
+ return ray.get(
186
+ sync_actor.broadcast_from_rank_zero.remote(
187
+ world_rank=self.distributed_context.world_rank,
188
+ world_size=self.distributed_context.world_size,
189
+ data=checkpoint_dir_name,
190
+ )
191
+ )
192
+
193
+ def _save_checkpoint(
194
+ self,
195
+ checkpoint_dir_name: str,
196
+ metrics: Dict[str, Any],
197
+ checkpoint: Optional[Checkpoint] = None,
198
+ ) -> _TrainingResult:
199
+ """Save the checkpoint to remote storage.
200
+
201
+ Returns:
202
+ The training result object containing the persisted checkpoint.
203
+ """
204
+
205
+ if not checkpoint:
206
+ return _TrainingResult(checkpoint=None, metrics=metrics)
207
+
208
+ # Persist the checkpoint to the remote storage path.
209
+ persisted_checkpoint = self.storage_context.persist_current_checkpoint(
210
+ checkpoint, checkpoint_dir_name
211
+ )
212
+ # Update latest checkpoint as the persisted checkpoint.
213
+ self.checkpoint = persisted_checkpoint
214
+
215
+ return _TrainingResult(checkpoint=persisted_checkpoint, metrics=metrics)
216
+
217
+ def report(
218
+ self,
219
+ metrics: Dict[str, Any],
220
+ checkpoint: Optional[Checkpoint] = None,
221
+ checkpoint_dir_name: Optional[str] = None,
222
+ ):
223
+ """
224
+ Upload checkpoint to remote storage and put a training
225
+ result on the result queue of this worker process.
226
+
227
+ Args:
228
+ metrics: The metrics to report.
229
+ checkpoint: The checkpoint to report.
230
+ checkpoint_dir_name: The name of the checkpoint dir
231
+ in this iteration. Note: If not set, the checkpoint will
232
+ be stored in the default storage path. If set, make sure
233
+ this value is unique for each iteration.
234
+
235
+ TODO: the report function should be implemented in the worker instead
236
+ of in the train context. The train context should only keep the train
237
+ related information and not the worker related actions. This refactor
238
+ would also require the `TrainContextCallback` to be updated as well.
239
+ """
240
+
241
+ with invoke_context_managers(
242
+ [
243
+ callback.on_report
244
+ for callback in self.execution_context.train_context_callbacks
245
+ ]
246
+ ):
247
+ # Step 1: sync the checkpoint dir name across ranks.
248
+ checkpoint_dir_name = self._sync_checkpoint_dir_name_across_ranks(
249
+ checkpoint_dir_name
250
+ )
251
+ # Step 2: save the checkpoint to remote storage.
252
+ training_result = self._save_checkpoint(
253
+ checkpoint_dir_name, metrics, checkpoint
254
+ )
255
+ # Step 3: Report the training result to the result queue.
256
+ # The queue size is set to 1 to avoid accumulating unprocessed results.
257
+ # If the queue is full, the put operation blocks until a result is consumed.
258
+
259
+ # TODO (hpguo): Add a metrics to track the blocking time waiting for the
260
+ # training result to be consumed by the controller.
261
+ self.get_result_queue().put(training_result)
262
+
263
+
264
+ # The global variable holding the current TrainContext
265
+ _train_context: Optional[TrainContext] = None
266
+
267
+ # Thread lock to protect the global TrainContext
268
+ _context_lock = threading.Lock()
269
+
270
+
271
+ def get_train_context() -> TrainContext:
272
+ with _context_lock:
273
+ if _train_context is None:
274
+ raise RuntimeError("TrainContext has not been initialized.")
275
+ return _train_context
276
+
277
+
278
+ def set_train_context(context) -> None:
279
+ global _train_context
280
+ with _context_lock:
281
+ _train_context = context
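As a usage sketch (hypothetical, not part of this diff): code running inside a training worker process interacts with this module only through `get_train_context()`, whose accessors mirror the public `ray.train` session API. `set_train_context(...)` is called by the worker setup code before the user function runs.

from ray.train.v2._internal.execution.context import get_train_context

# Inside the training function, after the worker has set up the context:
ctx = get_train_context()
rank = ctx.get_world_rank()
world_size = ctx.get_world_size()

# Reporting goes through the same path as `ray.train.report`: sync the
# checkpoint dir name across ranks, persist the checkpoint (if any), then
# enqueue the training result for the controller to consume.
ctx.report(metrics={"loss": 0.1, "rank": rank}, checkpoint=None)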
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/controller.py ADDED
@@ -0,0 +1,377 @@
1
+ import logging
2
+ import os
3
+ import time
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ from typing import Any, Callable, Dict, List, Optional
7
+
8
+ from ray._private.auto_init_hook import wrap_auto_init
9
+ from ray.train import Checkpoint
10
+ from ray.train.v2._internal.constants import (
11
+ DEFAULT_HEALTH_CHECK_INTERVAL_S,
12
+ HEALTH_CHECK_INTERVAL_S_ENV_VAR,
13
+ )
14
+ from ray.train.v2._internal.exceptions import (
15
+ TrainingFailedError,
16
+ WorkerGroupStartupFailedError,
17
+ WorkerGroupStartupTimeoutError,
18
+ )
19
+ from ray.train.v2._internal.execution.callback import (
20
+ ControllerCallback,
21
+ ReportCallback,
22
+ TrainContextCallback,
23
+ WorkerCallback,
24
+ WorkerGroupCallback,
25
+ )
26
+ from ray.train.v2._internal.execution.checkpoint.checkpoint_manager import (
27
+ CheckpointManager,
28
+ )
29
+ from ray.train.v2._internal.execution.checkpoint.report_handler import (
30
+ ReportCallbackHandler,
31
+ )
32
+ from ray.train.v2._internal.execution.context import TrainRunContext
33
+ from ray.train.v2._internal.execution.failure_handling import (
34
+ FailureDecision,
35
+ FailurePolicy,
36
+ )
37
+ from ray.train.v2._internal.execution.scaling_policy import (
38
+ ResizeDecision,
39
+ ScalingDecision,
40
+ ScalingPolicy,
41
+ )
42
+ from ray.train.v2._internal.execution.storage import StorageContext, get_fs_and_path
43
+ from ray.train.v2._internal.execution.worker_group import WorkerGroup, WorkerGroupStatus
44
+ from ray.train.v2._internal.logging.logging import configure_controller_logger
45
+ from ray.train.v2._internal.util import time_monotonic
46
+ from ray.train.v2.api.result import Result
47
+ from ray.train.v2.api.callback import RayTrainCallback
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+
52
+ class TrainControllerState(Enum):
53
+ """The possible states that the training controller can be in
54
+ while running the main execution control loop.
55
+
56
+ States:
57
+ RUNNING: The training controller is actively running training tasks.
58
+ RECOVERING: The training controller is in the process of recovering
59
+ from an error.
60
+ INITIALIZING: The train controller is starting up.
61
+ This is always the initial state of the controller.
62
+ ERRORED: A terminal state indicating that training has encountered
63
+ an error and cannot continue.
64
+ FINISHED: A terminal state indicating that training has completed.
65
+ """
66
+
67
+ RUNNING = "RUNNING"
68
+ INITIALIZING = "INITIALIZING"
69
+ RECOVERING = "RECOVERING"
70
+ ERRORED = "ERRORED"
71
+ FINISHED = "FINISHED"
72
+
73
+
74
+ class TrainController:
75
+ """Manages the execution of a distributed training job.
76
+
77
+ Responsibilities include:
78
+ * Triggering the training function to run on the worker group.
79
+ * Monitoring the status of the worker group.
80
+ * Handling scaling decisions by restarting the worker group.
81
+ * Handling failure decisions by restarting the worker group or terminating training.
82
+ * Running callback logic on different hooks in the control loop.
83
+ """
84
+
85
+ worker_group_cls = WorkerGroup
86
+
87
+ def __init__(
88
+ self,
89
+ train_fn: Callable[[Dict[str, Any]], None],
90
+ train_run_context: TrainRunContext,
91
+ scaling_policy: ScalingPolicy,
92
+ failure_policy: FailurePolicy,
93
+ callbacks: Optional[List[RayTrainCallback]] = None,
94
+ # TODO: [Deprecation]
95
+ resume_from_checkpoint: Optional[Checkpoint] = None,
96
+ ):
97
+ self._train_run_context = train_run_context
98
+ configure_controller_logger(self._train_run_context)
99
+ self._train_fn = train_fn
100
+ self._scaling_policy = scaling_policy
101
+ self._failure_policy = failure_policy
102
+ self._run_config = self._train_run_context.run_config
103
+ self._callbacks = callbacks or []
104
+ self._resume_from_checkpoint = resume_from_checkpoint
105
+ self._storage_context = StorageContext(
106
+ storage_path=self._run_config.storage_path,
107
+ experiment_dir_name=self._run_config.name,
108
+ storage_filesystem=self._run_config.storage_filesystem,
109
+ )
110
+
111
+ self._checkpoint_manager = CheckpointManager(
112
+ checkpoint_config=self._run_config.checkpoint_config,
113
+ storage_context=self._storage_context,
114
+ )
115
+ report_handler = ReportCallbackHandler(
116
+ report_callbacks=(
117
+ [self._checkpoint_manager]
118
+ + [c for c in self._callbacks if isinstance(c, ReportCallback)]
119
+ )
120
+ )
121
+
122
+ # Group callbacks by the hooks they're subscribed to.
123
+ self._controller_callbacks = [self._scaling_policy] + [
124
+ c for c in self._callbacks if isinstance(c, ControllerCallback)
125
+ ]
126
+ # Group callbacks that will be propagated to the worker group,
127
+ # train worker and the train context.
128
+ worker_group_callbacks_to_propagate = [report_handler] + [
129
+ c
130
+ for c in self._callbacks
131
+ if isinstance(
132
+ c, (WorkerGroupCallback, WorkerCallback, TrainContextCallback)
133
+ )
134
+ ]
135
+
136
+ self._worker_group = self.worker_group_cls(
137
+ train_run_context=self._train_run_context,
138
+ callbacks=worker_group_callbacks_to_propagate,
139
+ )
140
+ self._state = TrainControllerState.INITIALIZING
141
+
142
+ self._latest_poll_time = float("-inf")
143
+ self._health_check_interval_s = float(
144
+ os.getenv(HEALTH_CHECK_INTERVAL_S_ENV_VAR, DEFAULT_HEALTH_CHECK_INTERVAL_S)
145
+ )
146
+ self._training_failed_error: Optional[TrainingFailedError] = None
147
+
148
+ def _execute_scaling_decision(
149
+ self, decision: ScalingDecision, worker_group_status: WorkerGroupStatus
150
+ ):
151
+ """Executes scaling decisions."""
152
+ for callback in self._controller_callbacks:
153
+ callback.before_controller_execute_scaling_decision(
154
+ decision, worker_group_status
155
+ )
156
+
157
+ if isinstance(decision, ResizeDecision):
158
+ self._restart_worker_group(
159
+ num_workers=decision.num_workers,
160
+ resources_per_worker=decision.resources_per_worker,
161
+ )
162
+
163
+ def _execute_failure_decision(
164
+ self, failure_decision: FailureDecision, worker_group_status: WorkerGroupStatus
165
+ ):
166
+ """Executes failure handling decisions (ex: restart, terminate)."""
167
+ assert worker_group_status.errors
168
+
169
+ for callback in self._controller_callbacks:
170
+ callback.before_controller_execute_failure_decision(
171
+ failure_decision, worker_group_status
172
+ )
173
+
174
+ if failure_decision == FailureDecision.NOOP:
175
+ assert self._state == TrainControllerState.RUNNING
176
+ return
177
+
178
+ errors_str = "\n".join(
179
+ [
180
+ f"[Rank {worker_rank}]\n{error}"
181
+ for worker_rank, error in worker_group_status.errors.items()
182
+ ]
183
+ )
184
+
185
+ if failure_decision == FailureDecision.RESTART:
186
+ logger.error(
187
+ "Restarting training worker group after encountering "
188
+ f"failures on {len(worker_group_status.errors)} worker(s):\n"
189
+ f"{errors_str}"
190
+ )
191
+ # Shutdown the worker group so that we don't keep polling errored tasks.
192
+ self._worker_group.shutdown()
193
+ self._set_state(TrainControllerState.RECOVERING)
194
+ elif failure_decision == FailureDecision.RAISE:
195
+ logger.error(
196
+ "Terminating training worker group after encountering "
197
+ f"failure(s) on {len(worker_group_status.errors)} worker(s):\n"
198
+ f"{errors_str}"
199
+ )
200
+ self._set_state(TrainControllerState.ERRORED)
201
+ self._training_failed_error = TrainingFailedError(
202
+ worker_failures=worker_group_status.errors
203
+ )
204
+ else:
205
+ raise ValueError(f"Unexpected failure decision: {failure_decision}")
206
+
207
+ def _poll_workers(self) -> WorkerGroupStatus:
208
+ # Ensure that the time between polls is at least HEALTH_CHECK_INTERVAL_S.
209
+ time_since_last_poll = time_monotonic() - self._latest_poll_time
210
+ if time_since_last_poll < self._health_check_interval_s:
211
+ remaining_time = max(
212
+ self._health_check_interval_s - time_since_last_poll, 0
213
+ )
214
+ time.sleep(remaining_time)
215
+
216
+ status = self._worker_group.poll_status(timeout=self._health_check_interval_s)
217
+ self._latest_poll_time = time_monotonic()
218
+ return status
219
+
220
+ def _restart_worker_group(self, num_workers: int, resources_per_worker: dict):
221
+ """Restart the worker group and launch the train function."""
222
+ self._worker_group.shutdown()
223
+
224
+ # If there's a latest checkpoint that's been committed,
225
+ # use it to restore the worker group.
226
+ latest_checkpoint_result = self._checkpoint_manager.latest_checkpoint_result
227
+ latest_checkpoint = (
228
+ latest_checkpoint_result.checkpoint if latest_checkpoint_result else None
229
+ )
230
+ placement_strategy = self._scaling_policy.scaling_config.placement_strategy
231
+
232
+ # Start the worker group with the latest checkpoint if there is one.
233
+ # Otherwise, start the worker group with the checkpoint set by controller.
234
+ # Finally, if there is no checkpoint, start the worker group with None.
235
+ try:
236
+ self._worker_group.start(
237
+ train_fn=self._train_fn,
238
+ num_workers=num_workers,
239
+ resources_per_worker=resources_per_worker,
240
+ placement_strategy=placement_strategy,
241
+ checkpoint=latest_checkpoint or self._resume_from_checkpoint,
242
+ )
243
+ except (WorkerGroupStartupTimeoutError, WorkerGroupStartupFailedError) as e:
244
+ logger.error(
245
+ "Retrying the launch of the training worker group. "
246
+ f"The previous launch attempt encountered the following failure:\n{e}"
247
+ )
248
+
249
+ # TODO: Should this logic go through the failure policy?
250
+ # The current logic will always try recovering unconditionally
251
+ # on startup errors without a retry limit.
252
+ self._set_state(TrainControllerState.RECOVERING)
253
+ return
254
+
255
+ # TODO: Consider starting the worker group asynchronously.
256
+ self._set_state(TrainControllerState.RUNNING)
257
+
258
+ def _start(self):
259
+ for callback in self._controller_callbacks:
260
+ callback.after_controller_start()
261
+
262
+ def _shutdown(self):
263
+ self._worker_group.shutdown()
264
+
265
+ for callback in self._controller_callbacks:
266
+ callback.before_controller_shutdown()
267
+
268
+ def get_worker_group(self) -> WorkerGroup:
269
+ return self._worker_group
270
+
271
+ def get_state(self) -> TrainControllerState:
272
+ return self._state
273
+
274
+ def _set_state(self, state: TrainControllerState):
275
+ previous_state = self._state
276
+ self._state = state
277
+
278
+ for callback in self._controller_callbacks:
279
+ callback.after_controller_state_update(previous_state, state)
280
+
281
+ def _run_control_loop_iteration(self):
282
+ """Run a single iteration of the control loop.
283
+
284
+ Steps:
285
+ 1. Poll the worker group for status.
286
+ 2. If the worker group is initializing or recovering from an error,
287
+ make a scaling decision and execute it.
288
+ 3. If the worker group has finished, set the controller state to FINISHED.
289
+ 4. If the worker group has errors, make a failure decision and execute it.
290
+ 5. Otherwise, the worker group is running healthily.
291
+ Query the scaling policy for a scaling decision and execute it.
292
+ """
293
+ assert self.get_state() in (
294
+ TrainControllerState.RUNNING,
295
+ TrainControllerState.RECOVERING,
296
+ TrainControllerState.INITIALIZING,
297
+ ), self.get_state()
298
+
299
+ worker_group_status = self._poll_workers()
300
+
301
+ if worker_group_status.finished and not worker_group_status.errors:
302
+ self._set_state(TrainControllerState.FINISHED)
303
+ return
304
+
305
+ if self.get_state() in (
306
+ TrainControllerState.INITIALIZING,
307
+ TrainControllerState.RECOVERING,
308
+ ):
309
+ scaling_decision = (
310
+ self._scaling_policy.make_decision_for_non_running_worker_group(
311
+ worker_group_status
312
+ )
313
+ )
314
+ self._execute_scaling_decision(scaling_decision, worker_group_status)
315
+ elif self.get_state() == TrainControllerState.RUNNING:
316
+ if worker_group_status.errors:
317
+ failure_decision = self._failure_policy.make_decision(
318
+ worker_group_status
319
+ )
320
+ self._execute_failure_decision(failure_decision, worker_group_status)
321
+ else:
322
+ scaling_decision = (
323
+ self._scaling_policy.make_decision_for_running_worker_group(
324
+ worker_group_status
325
+ )
326
+ )
327
+ self._execute_scaling_decision(scaling_decision, worker_group_status)
328
+
329
+ @wrap_auto_init
330
+ def run(self):
331
+ """Run the main control loop. Exits when training is finished or errored."""
332
+ self._start()
333
+
334
+ while self.get_state() not in (
335
+ TrainControllerState.ERRORED,
336
+ TrainControllerState.FINISHED,
337
+ ):
338
+ self._run_control_loop_iteration()
339
+
340
+ self._shutdown()
341
+
342
+ def get_result(self) -> Result:
343
+ """Get the final training result from the TrainController."""
344
+
345
+ controller_state = self.get_state()
346
+ if controller_state not in (
347
+ TrainControllerState.FINISHED,
348
+ TrainControllerState.ERRORED,
349
+ ):
350
+ raise ValueError(
351
+ f"Cannot get result when controller is in state {controller_state}"
352
+ )
353
+
354
+ latest_checkpoint_result = self._checkpoint_manager.latest_checkpoint_result
355
+ latest_metrics = (
356
+ latest_checkpoint_result.metrics if latest_checkpoint_result else None
357
+ )
358
+ latest_checkpoint = (
359
+ latest_checkpoint_result.checkpoint if latest_checkpoint_result else None
360
+ )
361
+ best_checkpoints = [
362
+ (r.checkpoint, r.metrics)
363
+ for r in self._checkpoint_manager.best_checkpoint_results
364
+ ]
365
+ storage_filesystem, storage_fs_path = get_fs_and_path(
366
+ self._run_config.storage_path, self._run_config.storage_filesystem
367
+ )
368
+ experiment_fs_path = Path(storage_fs_path, self._run_config.name).as_posix()
369
+
370
+ return Result(
371
+ metrics=latest_metrics,
372
+ checkpoint=latest_checkpoint,
373
+ error=self._training_failed_error,
374
+ path=experiment_fs_path,
375
+ best_checkpoints=best_checkpoints,
376
+ _storage_filesystem=storage_filesystem,
377
+ )
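Putting the pieces together, here is a rough driver-side sketch of how a controller is assembled and run. The constructor arguments, `run()`, and `get_result()` come from the class above; the `train_fn`, run name, and storage path are placeholders, and in practice this wiring is normally done by Ray Train's public trainer APIs rather than by user code.

from ray.train import FailureConfig
from ray.train.v2._internal.execution.context import TrainRunContext
from ray.train.v2._internal.execution.controller import TrainController
from ray.train.v2._internal.execution.failure_handling.factory import create_failure_policy
from ray.train.v2._internal.execution.scaling_policy import create_scaling_policy
from ray.train.v2.api.config import RunConfig, ScalingConfig


def train_fn(config):
    # Placeholder user training loop.
    pass


controller = TrainController(
    train_fn=train_fn,
    train_run_context=TrainRunContext(
        run_config=RunConfig(name="example-run", storage_path="/tmp/ray_results")
    ),
    scaling_policy=create_scaling_policy(ScalingConfig(num_workers=2)),
    failure_policy=create_failure_policy(FailureConfig(max_failures=0)),
)
controller.run()  # Blocks until the run is FINISHED or ERRORED.
result = controller.get_result()
print(result.path, result.error)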
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (511 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/default.cpython-311.pyc ADDED
Binary file (2.67 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/factory.cpython-311.pyc ADDED
Binary file (797 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/__pycache__/failure_policy.cpython-311.pyc ADDED
Binary file (1.7 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/default.py ADDED
@@ -0,0 +1,44 @@
1
+ import logging
2
+
3
+ from ray.train import FailureConfig
4
+ from ray.train.v2._internal.execution.failure_handling import (
5
+ FailureDecision,
6
+ FailurePolicy,
7
+ )
8
+ from ray.train.v2._internal.execution.worker_group import WorkerGroupStatus
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class DefaultFailurePolicy(FailurePolicy):
14
+ def __init__(self, failure_config: FailureConfig):
15
+ super().__init__(failure_config)
16
+ self._total_failures = 0
17
+
18
+ def make_decision(self, worker_group_status: WorkerGroupStatus) -> FailureDecision:
19
+ if not worker_group_status.errors:
20
+ return FailureDecision.NOOP
21
+
22
+ self._total_failures += 1
23
+
24
+ if self.failure_config.max_failures == -1:
25
+ logger.info(
26
+ "Deciding to RESTART, since infinite retry is enabled. "
27
+ f"Encountered {self._total_failures} failures so far."
28
+ )
29
+ return FailureDecision.RESTART
30
+
31
+ if self._total_failures > self.failure_config.max_failures:
32
+ logger.info(
33
+ "Deciding to TERMINATE, since the total failure count "
34
+ f"({self._total_failures}) exceeded the maximum allowed failures: "
35
+ f"FailureConfig(max_failures={self.failure_config.max_failures})."
36
+ )
37
+ return FailureDecision.RAISE
38
+
39
+ logger.info(
40
+ "Deciding to RESTART, since the total "
41
+ f"failure count ({self._total_failures}) <= "
42
+ f"FailureConfig(max_failures={self.failure_config.max_failures})."
43
+ )
44
+ return FailureDecision.RESTART
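The decision rule above is: with `FailureConfig(max_failures=N)`, the first N failures produce RESTART and failure N+1 produces RAISE, while `max_failures=-1` restarts forever. A hedged illustration follows; the fake status object is a stand-in, since `make_decision` only reads the `errors` attribute of the worker group status.

from types import SimpleNamespace

from ray.train import FailureConfig
from ray.train.v2._internal.execution.failure_handling import DefaultFailurePolicy

policy = DefaultFailurePolicy(FailureConfig(max_failures=1))

failed = SimpleNamespace(errors={0: RuntimeError("worker 0 died")})
healthy = SimpleNamespace(errors={})

print(policy.make_decision(healthy))  # FailureDecision.NOOP
print(policy.make_decision(failed))   # 1st failure -> FailureDecision.RESTART
print(policy.make_decision(failed))   # 2nd failure -> FailureDecision.RAISE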
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/failure_handling/factory.py ADDED
@@ -0,0 +1,13 @@
1
+ from ray.train import FailureConfig
2
+ from ray.train.v2._internal.execution.failure_handling import (
3
+ DefaultFailurePolicy,
4
+ FailurePolicy,
5
+ )
6
+
7
+
8
+ def create_failure_policy(failure_config: FailureConfig) -> FailurePolicy:
9
+ """Create a failure policy from the given failure config.
10
+
11
+ Defaults to the `DefaultFailurePolicy` implementation.
12
+ """
13
+ return DefaultFailurePolicy(failure_config=failure_config)
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # isort: off
2
+ from .scaling_policy import ScalingDecision, ScalingPolicy, NoopDecision, ResizeDecision
3
+ from .fixed import FixedScalingPolicy
4
+ from .factory import create_scaling_policy
5
+
6
+ # isort: on
7
+
8
+
9
+ __all__ = [
10
+ "ScalingPolicy",
11
+ "FixedScalingPolicy",
12
+ "ScalingDecision",
13
+ "NoopDecision",
14
+ "ResizeDecision",
15
+ "create_scaling_policy",
16
+ ]
17
+
18
+
19
+ # DO NOT ADD ANYTHING AFTER THIS LINE.
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/factory.cpython-311.pyc ADDED
Binary file (805 Bytes).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/fixed.cpython-311.pyc ADDED
Binary file (1.54 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/__pycache__/scaling_policy.cpython-311.pyc ADDED
Binary file (3.09 kB).
 
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/factory.py ADDED
@@ -0,0 +1,13 @@
1
+ from ray.train.v2._internal.execution.scaling_policy import (
2
+ FixedScalingPolicy,
3
+ ScalingPolicy,
4
+ )
5
+ from ray.train.v2.api.config import ScalingConfig
6
+
7
+
8
+ def create_scaling_policy(scaling_config: ScalingConfig) -> ScalingPolicy:
9
+ """Create a scaling policy from the given scaling config.
10
+
11
+ Defaults to the `FixedScalingPolicy` implementation.
12
+ """
13
+ return FixedScalingPolicy(scaling_config=scaling_config)
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/fixed.py ADDED
@@ -0,0 +1,22 @@
1
+ from ray.train.v2._internal.execution.scaling_policy import (
2
+ NoopDecision,
3
+ ResizeDecision,
4
+ ScalingDecision,
5
+ ScalingPolicy,
6
+ )
7
+ from ray.train.v2._internal.execution.worker_group import WorkerGroupStatus
8
+
9
+
10
+ class FixedScalingPolicy(ScalingPolicy):
11
+ def make_decision_for_non_running_worker_group(
12
+ self, worker_group_status: WorkerGroupStatus
13
+ ) -> ScalingDecision:
14
+ return ResizeDecision(
15
+ num_workers=self.scaling_config.num_workers,
16
+ resources_per_worker=self.scaling_config._resources_per_worker_not_none,
17
+ )
18
+
19
+ def make_decision_for_running_worker_group(
20
+ self, worker_group_status: WorkerGroupStatus
21
+ ) -> ScalingDecision:
22
+ return NoopDecision()
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/scaling_policy/scaling_policy.py ADDED
@@ -0,0 +1,51 @@
1
+ import abc
2
+ from dataclasses import dataclass
3
+ from typing import Dict
4
+
5
+ from ray.train.v2._internal.execution.callback import ControllerCallback
6
+ from ray.train.v2._internal.execution.worker_group import WorkerGroupStatus
7
+ from ray.train.v2.api.config import ScalingConfig
8
+
9
+
10
+ @dataclass
11
+ class ScalingDecision:
12
+ pass
13
+
14
+
15
+ @dataclass
16
+ class NoopDecision(ScalingDecision):
17
+ pass
18
+
19
+
20
+ @dataclass
21
+ class ResizeDecision(ScalingDecision):
22
+ num_workers: int
23
+ resources_per_worker: Dict[str, float]
24
+
25
+
26
+ class ScalingPolicy(abc.ABC, ControllerCallback):
27
+ """A policy that determines when and how to scale a worker group.
28
+
29
+ This can be used to implement elasticity and fault tolerance.
30
+
31
+ Recovery decisions are made when workers are in an inactive or unhealthy state.
32
+ Upscale decisions are optional and are made when workers are healthy.
33
+ """
34
+
35
+ def __init__(self, scaling_config: ScalingConfig):
36
+ self.scaling_config = scaling_config
37
+
38
+ @abc.abstractmethod
39
+ def make_decision_for_non_running_worker_group(
40
+ self, worker_group_status: WorkerGroupStatus
41
+ ) -> ScalingDecision:
42
+ """Makes a scaling decision when the worker group is initializing
43
+ or recovering from an error."""
44
+ raise NotImplementedError
45
+
46
+ @abc.abstractmethod
47
+ def make_decision_for_running_worker_group(
48
+ self, worker_group_status: WorkerGroupStatus
49
+ ) -> ScalingDecision:
50
+ """Makes a scaling decision when monitoring healthy, running workers."""
51
+ raise NotImplementedError
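Since `ScalingPolicy` is an abstract base class (and also a `ControllerCallback`), elasticity can be added by plugging in a different implementation than `FixedScalingPolicy`. A purely illustrative, hedged sketch of a custom policy that sizes the group to whatever currently fits in the cluster, capped at `num_workers`, and never resizes a healthy running group:

import ray

from ray.train.v2._internal.execution.scaling_policy import (
    NoopDecision,
    ResizeDecision,
    ScalingDecision,
    ScalingPolicy,
)
from ray.train.v2._internal.execution.worker_group import WorkerGroupStatus


class BestEffortScalingPolicy(ScalingPolicy):
    # Illustrative only: not part of Ray Train.

    def make_decision_for_non_running_worker_group(
        self, worker_group_status: WorkerGroupStatus
    ) -> ScalingDecision:
        resources = self.scaling_config._resources_per_worker_not_none
        cpus_per_worker = resources.get("CPU", 1) or 1
        available_cpus = ray.available_resources().get("CPU", 0)
        num_workers = min(
            self.scaling_config.num_workers,
            max(1, int(available_cpus // cpus_per_worker)),
        )
        return ResizeDecision(num_workers=num_workers, resources_per_worker=resources)

    def make_decision_for_running_worker_group(
        self, worker_group_status: WorkerGroupStatus
    ) -> ScalingDecision:
        return NoopDecision()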
.venv/lib/python3.11/site-packages/ray/train/v2/_internal/execution/storage.py ADDED
@@ -0,0 +1,551 @@
1
+ # Try import ray[train] core requirements (defined in setup.py)
2
+ # isort: off
3
+ try:
4
+ import fsspec # noqa
5
+ from fsspec.implementations.local import LocalFileSystem
6
+
7
+ except (ImportError, ModuleNotFoundError) as e:
8
+ raise RuntimeError(
9
+ "fsspec is a required dependency of Ray Train and Ray Tune. "
10
+ "Please install with: `pip install fsspec`"
11
+ ) from e
12
+
13
+ try:
14
+ import pyarrow
15
+ import pyarrow.fs
16
+
17
+ except (ImportError, ModuleNotFoundError) as e:
18
+ raise RuntimeError(
19
+ "pyarrow is a required dependency of Ray Train and Ray Tune. "
20
+ "Please install with: `pip install pyarrow`"
21
+ ) from e
22
+ # isort: on
23
+
24
+ import fnmatch
25
+ import logging
26
+ import os
27
+ import shutil
28
+ from pathlib import Path
29
+ from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Type, Union
30
+
31
+ from ray.air._internal.filelock import TempFileLock
32
+ from ray.train.constants import _get_ray_train_session_dir
33
+ from ray.train.v2._internal.constants import (
34
+ CHECKPOINT_MANAGER_SNAPSHOT_FILENAME,
35
+ VALIDATE_STORAGE_MARKER_FILENAME,
36
+ )
37
+ from ray.train.v2._internal.util import date_str
38
+ from ray.util.annotations import DeveloperAPI
39
+
40
+ if TYPE_CHECKING:
41
+ from ray.train import Checkpoint
42
+
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ class _ExcludingLocalFilesystem(LocalFileSystem):
48
+ """LocalFileSystem wrapper to exclude files according to patterns.
49
+
50
+ Args:
51
+ root_path: Root path to strip when matching with the exclude pattern.
52
+ Ex: root_path="/tmp/a/b/c", exclude=["*a*"], will exclude
53
+ /tmp/a/b/c/_a_.txt but not ALL of /tmp/a/*.
54
+ exclude: List of patterns that are applied to files returned by
55
+ ``self.find()``. If a file path matches this pattern, it will
56
+ be excluded.
57
+
58
+ """
59
+
60
+ def __init__(self, root_path: Path, exclude: List[str], **kwargs):
61
+ super().__init__(**kwargs)
62
+ self._exclude = exclude
63
+ self._root_path = root_path
64
+
65
+ @property
66
+ def fsid(self):
67
+ return "_excluding_local"
68
+
69
+ def _should_exclude(self, path: str) -> bool:
70
+ """Return True if `path` (relative to `root_path`) matches any of the
71
+ `self._exclude` patterns."""
72
+ path = Path(path)
73
+ relative_path = path.relative_to(self._root_path).as_posix()
74
+ match_candidates = [relative_path]
75
+ if path.is_dir():
76
+ # Everything is in posix path format ('/')
77
+ match_candidates.append(relative_path + "/")
78
+
79
+ for excl in self._exclude:
80
+ if any(fnmatch.fnmatch(candidate, excl) for candidate in match_candidates):
81
+ return True
82
+ return False
83
+
84
+ def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
85
+ """Call parent find() and exclude from result."""
86
+ paths = super().find(
87
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
88
+ )
89
+ if detail:
90
+ return {
91
+ path: out
92
+ for path, out in paths.items()
93
+ if not self._should_exclude(path)
94
+ }
95
+ else:
96
+ return [path for path in paths if not self._should_exclude(path)]
97
+
98
+
99
+ def _pyarrow_fs_copy_files(
100
+ source, destination, source_filesystem=None, destination_filesystem=None, **kwargs
101
+ ):
102
+ if isinstance(destination_filesystem, pyarrow.fs.S3FileSystem):
103
+ # Workaround multi-threading issue with pyarrow. Note that use_threads=True
104
+ # is safe for download, just not for uploads, see:
105
+ # https://github.com/apache/arrow/issues/32372
106
+ kwargs.setdefault("use_threads", False)
107
+
108
+ # Use a large chunk size to speed up large checkpoint transfers.
109
+ kwargs.setdefault("chunk_size", 64 * 1024 * 1024)
110
+
111
+ return pyarrow.fs.copy_files(
112
+ source,
113
+ destination,
114
+ source_filesystem=source_filesystem,
115
+ destination_filesystem=destination_filesystem,
116
+ **kwargs,
117
+ )
118
+
119
+
120
+ # TODO(justinvyu): Add unit tests for all these utils.
121
+
122
+
123
+ def _delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str):
124
+ is_dir = _is_directory(fs, fs_path)
125
+
126
+ try:
127
+ if is_dir:
128
+ fs.delete_dir(fs_path)
129
+ else:
130
+ fs.delete_file(fs_path)
131
+ except Exception:
132
+ logger.exception(f"Caught exception when deleting path at ({fs}, {fs_path}):")
133
+
134
+
135
+ def _download_from_fs_path(
136
+ fs: pyarrow.fs.FileSystem,
137
+ fs_path: str,
138
+ local_path: str,
139
+ filelock: bool = True,
140
+ ):
141
+ """Downloads a directory or file from (fs, fs_path) to a local path.
142
+
143
+ If fs_path points to a directory:
144
+ - The full directory contents are downloaded directly into `local_path`,
145
+ rather than to a subdirectory of `local_path`.
146
+
147
+ If fs_path points to a file:
148
+ - The file is downloaded to `local_path`, which is expected to be a file path.
149
+
150
+ If the download fails and `local_path` did not previously exist, its
151
+ contents are cleaned up before re-raising.
152
+
153
+ NOTE: This method creates `local_path`'s parent directories if they do not
154
+ already exist. If the download fails, this does NOT clean up all the parent
155
+ directories that were created.
156
+
157
+ Args:
158
+ fs: The filesystem to download from.
159
+ fs_path: The filesystem path (either a directory or a file) to download.
160
+ local_path: The local path to download to.
161
+ filelock: Whether to require a file lock before downloading, useful for
162
+ multiple downloads to the same directory that may be happening in parallel.
163
+
164
+ Raises:
165
+ FileNotFoundError: if (fs, fs_path) doesn't exist.
166
+ """
167
+
168
+ _local_path = Path(local_path).resolve()
169
+ exists_before = _local_path.exists()
170
+ if _is_directory(fs=fs, fs_path=fs_path):
171
+ _local_path.mkdir(parents=True, exist_ok=True)
172
+ else:
173
+ _local_path.parent.mkdir(parents=True, exist_ok=True)
174
+
175
+ try:
176
+ if filelock:
177
+ with TempFileLock(f"{os.path.normpath(local_path)}.lock"):
178
+ _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
179
+ else:
180
+ _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
181
+ except Exception as e:
182
+ # Clean up the directory if downloading was unsuccessful
183
+ if not exists_before:
184
+ shutil.rmtree(local_path, ignore_errors=True)
185
+ raise e
186
+
187
+
188
+ def _upload_to_fs_path(
189
+ local_path: str,
190
+ fs: pyarrow.fs.FileSystem,
191
+ fs_path: str,
192
+ exclude: Optional[List[str]] = None,
193
+ ) -> None:
194
+ """Uploads a local directory or file to (fs, fs_path).
195
+
196
+ NOTE: This will create all necessary parent directories at the destination.
197
+
198
+ Args:
199
+ local_path: The local path to upload.
200
+ fs: The filesystem to upload to.
201
+ fs_path: The filesystem path where the dir/file will be uploaded to.
202
+ exclude: A list of filename matches to exclude from upload. This includes
203
+ all files under subdirectories as well.
204
+ This pattern will match with the relative paths of all files under
205
+ `local_path`.
206
+ Ex: ["*.png"] to exclude all .png images.
207
+ """
208
+
209
+ if not exclude:
210
+ # TODO(justinvyu): uploading a single file doesn't work
211
+ # (since we always create a directory at fs_path)
212
+ _create_directory(fs=fs, fs_path=fs_path)
213
+ _pyarrow_fs_copy_files(local_path, fs_path, destination_filesystem=fs)
214
+ return
215
+
216
+ _upload_to_uri_with_exclude_fsspec(
217
+ local_path=local_path, fs=fs, fs_path=fs_path, exclude=exclude
218
+ )
219
+
220
+
221
+ def _upload_to_uri_with_exclude_fsspec(
222
+ local_path: str, fs: "pyarrow.fs", fs_path: str, exclude: Optional[List[str]]
223
+ ) -> None:
224
+ local_fs = _ExcludingLocalFilesystem(root_path=local_path, exclude=exclude)
225
+ handler = pyarrow.fs.FSSpecHandler(local_fs)
226
+ source_fs = pyarrow.fs.PyFileSystem(handler)
227
+
228
+ _create_directory(fs=fs, fs_path=fs_path)
229
+ _pyarrow_fs_copy_files(
230
+ local_path, fs_path, source_filesystem=source_fs, destination_filesystem=fs
231
+ )
232
+
233
+
+ def _list_at_fs_path(
+     fs: pyarrow.fs.FileSystem,
+     fs_path: str,
+     file_filter: Callable[[pyarrow.fs.FileInfo], bool] = lambda x: True,
+ ) -> List[str]:
+     """Returns the list of filenames at (fs, fs_path), similar to os.listdir.
+
+     If the path doesn't exist, returns an empty list.
+     """
+     selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False)
+     return [
+         os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/"))
+         for file_info in fs.get_file_info(selector)
+         if file_filter(file_info)
+     ]
+
+
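+ # A short sketch of `_list_at_fs_path` with a custom filter (illustrative only;
+ # the helper name and directory are hypothetical). The filter receives
+ # `pyarrow.fs.FileInfo` objects, so it can select by type, size, or name.
+ def _example_list_checkpoints() -> List[str]:  # pragma: no cover - illustrative sketch
+     import pyarrow.fs
+
+     fs = pyarrow.fs.LocalFileSystem()
+     # Keep only subdirectories whose names look like checkpoint directories.
+     return _list_at_fs_path(
+         fs=fs,
+         fs_path="/tmp/ray_results/exp_name",
+         file_filter=lambda info: not info.is_file
+         and info.base_name.startswith("checkpoint_"),
+     )
+
+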
+ def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool:
+     """Returns True if (fs, fs_path) exists."""
+
+     valid = fs.get_file_info(fs_path)
+     return valid.type != pyarrow.fs.FileType.NotFound
+
+
+ def _is_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool:
+     """Returns True if (fs, fs_path) is a directory and False if it is a file.
+
+     Raises:
+         FileNotFoundError: if (fs, fs_path) doesn't exist.
+     """
+
+     file_info = fs.get_file_info(fs_path)
+     if file_info.type == pyarrow.fs.FileType.NotFound:
+         raise FileNotFoundError(f"Path not found: ({fs}, {fs_path})")
+
+     return not file_info.is_file
+
+
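+ # A small sketch combining `_exists_at_fs_path` and `_is_directory`
+ # (illustrative only; the helper name and path are hypothetical).
+ def _example_inspect_path() -> None:  # pragma: no cover - illustrative sketch
+     import pyarrow.fs
+
+     fs = pyarrow.fs.LocalFileSystem()
+     fs_path = "/tmp/ray_results/exp_name"
+     if not _exists_at_fs_path(fs=fs, fs_path=fs_path):
+         print(f"{fs_path} does not exist")
+     elif _is_directory(fs=fs, fs_path=fs_path):
+         print(f"{fs_path} is a directory")
+     else:
+         print(f"{fs_path} is a file")
+
+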
+ def _create_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> None:
+     """Create a directory at (fs, fs_path).
+
+     Some external filesystems require directories to already exist, or at least
+     the `netloc` to be created (e.g. PyArrow's ``mock://`` filesystem).
+
+     Generally this should be done before and outside of Ray applications. This
+     utility is thus primarily used in testing, e.g. of ``mock://`` URIs.
+     """
+     try:
+         fs.create_dir(fs_path)
+     except Exception:
+         logger.exception(
+             f"Caught exception when creating directory at ({fs}, {fs_path}):"
+         )
+
+
+ def get_fs_and_path(
+     storage_path: Union[str, os.PathLike],
+     storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
+ ) -> Tuple[pyarrow.fs.FileSystem, str]:
+     """Returns the fs and path from a storage path and an optional custom fs.
+
+     Args:
+         storage_path: A storage path or URI. (ex: s3://bucket/path or /tmp/ray_results)
+         storage_filesystem: A custom filesystem to use. If not provided,
+             this will be auto-resolved by pyarrow. If provided, the storage_path
+             is assumed to be prefix-stripped already, and must be a valid path
+             on the filesystem.
+     """
+     storage_path = str(storage_path)
+
+     if storage_filesystem:
+         return storage_filesystem, storage_path
+
+     return pyarrow.fs.FileSystem.from_uri(storage_path)
+
+
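+ # A short sketch of the two ways to call `get_fs_and_path` (illustrative only;
+ # the helper name and paths are hypothetical).
+ def _example_get_fs_and_path() -> None:  # pragma: no cover - illustrative sketch
+     import pyarrow.fs
+
+     # 1. Let pyarrow auto-resolve the filesystem. A plain path resolves to the
+     #    local filesystem; a URI such as "s3://bucket/path" resolves to the
+     #    corresponding cloud filesystem and returns the prefix-stripped path,
+     #    though that may require credentials to be configured.
+     fs, path = get_fs_and_path("/tmp/ray_results")
+
+     # 2. Provide a custom filesystem explicitly. The path is then assumed to
+     #    already be a valid, prefix-stripped path on that filesystem.
+     custom_fs = pyarrow.fs.LocalFileSystem()
+     fs, path = get_fs_and_path("/tmp/ray_results", storage_filesystem=custom_fs)
+
+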
+ @DeveloperAPI
+ class StorageContext:
+     """Shared context that holds the source of truth for all paths and
+     storage utilities, passed along from the driver to workers.
+
+     This object defines a few types of paths:
+     1. *_fs_path: A path on the `storage_filesystem`. This is a regular path
+        which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and
+        can be joined with `Path(...).as_posix()`.
+     2. *_driver_staging_path: The temporary staging directory on the local filesystem
+        where driver artifacts are saved to before persisting them to storage.
+     3. trial_working_directory: The local filesystem path that the remote
+        actors' working directories are moved to by default.
+        This is separated from the driver staging path so that driver syncing
+        does not implicitly upload the trial working directory, for trials on the
+        driver node.
+
+     Example with storage_path="mock:///bucket/path?param=1":
+
+         >>> import ray
+         >>> from ray.train._internal.storage import StorageContext
+         >>> import os
+         >>> _ = ray.init()
+         >>> storage = StorageContext(
+         ...     storage_path="mock://netloc/bucket/path?param=1",
+         ...     experiment_dir_name="exp_name",
+         ... )
+         >>> storage.storage_filesystem  # Auto-resolved  # doctest: +ELLIPSIS
+         <pyarrow._fs._MockFileSystem object...
+         >>> storage.experiment_fs_path
+         'bucket/path/exp_name'
+         >>> storage.experiment_driver_staging_path  # doctest: +ELLIPSIS
+         '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts'
+         >>> storage.trial_dir_name = "trial_dir"
+         >>> storage.trial_fs_path
+         'bucket/path/exp_name/trial_dir'
+         >>> storage.trial_driver_staging_path  # doctest: +ELLIPSIS
+         '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts/trial_dir'
+         >>> storage.trial_working_directory  # doctest: +ELLIPSIS
+         '/tmp/ray/session_.../artifacts/.../exp_name/working_dirs/trial_dir'
+         >>> ray.shutdown()
+
+     Example with storage_path="/tmp/ray_results":
+
+         >>> from ray.train._internal.storage import StorageContext
+         >>> storage = StorageContext(
+         ...     storage_path="/tmp/ray_results",
+         ...     experiment_dir_name="exp_name",
+         ... )
+         >>> storage.storage_fs_path
+         '/tmp/ray_results'
+         >>> storage.experiment_fs_path
+         '/tmp/ray_results/exp_name'
+         >>> storage.storage_filesystem  # Auto-resolved  # doctest: +ELLIPSIS
+         <pyarrow._fs.LocalFileSystem object...
+
+     Internal Usage Examples:
+     - To copy files to the trial directory on the storage filesystem:
+
+         pyarrow.fs.copy_files(
+             local_dir,
+             Path(storage.trial_fs_path, "subdir").as_posix(),
+             destination_filesystem=storage.filesystem
+         )
+
+     .. warning::
+         This is an experimental developer API and is subject to change
+         without notice between versions.
+     """
+
+     def __init__(
+         self,
+         storage_path: Union[str, os.PathLike],
+         experiment_dir_name: str,
+         storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
+     ):
+         self.custom_fs_provided = storage_filesystem is not None
+
+         # Invariant: (`storage_filesystem`, `storage_path`) is the location where
+         # *all* results can be accessed.
+         self.experiment_dir_name = experiment_dir_name
+
+         self.storage_filesystem, self.storage_fs_path = get_fs_and_path(
+             storage_path, storage_filesystem
+         )
+         self.storage_fs_path = Path(self.storage_fs_path).as_posix()
+
+         self._create_validation_file()
+         self._check_validation_file()
+
+     def __str__(self):
+         return (
+             "StorageContext<\n"
+             f" storage_filesystem='{self.storage_filesystem.type_name}',\n"
+             f" storage_fs_path='{self.storage_fs_path}',\n"
+             f" experiment_dir_name='{self.experiment_dir_name}',\n"
+             ">"
+         )
+
+     def _create_validation_file(self):
+         """On the creation of a storage context, create a validation file at the
+         storage path to verify that the storage path can be written to.
+         This validation file is also used to check whether the storage path is
+         accessible by all nodes in the cluster."""
+         valid_file = Path(
+             self.experiment_fs_path, VALIDATE_STORAGE_MARKER_FILENAME
+         ).as_posix()
+         self.storage_filesystem.create_dir(self.experiment_fs_path)
+         with self.storage_filesystem.open_output_stream(valid_file):
+             pass
+
+     def _check_validation_file(self):
+         """Checks that the validation file exists at the storage path."""
+         valid_file = Path(
+             self.experiment_fs_path, VALIDATE_STORAGE_MARKER_FILENAME
+         ).as_posix()
+         if not _exists_at_fs_path(fs=self.storage_filesystem, fs_path=valid_file):
+             raise RuntimeError(
+                 f"Unable to set up cluster storage with the following settings:\n{self}"
+                 "\nCheck that all nodes in the cluster have read/write access "
+                 "to the configured storage path. `RunConfig(storage_path)` should be "
+                 "set to a cloud storage URI or a shared filesystem path accessible "
+                 "by all nodes in your cluster ('s3://bucket' or '/mnt/nfs'). "
+                 "A local path on the head node is not accessible by worker nodes. "
+                 "See: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html"  # noqa: E501
+             )
+
+     def persist_current_checkpoint(
+         self, checkpoint: "Checkpoint", checkpoint_dir_name: str
+     ) -> "Checkpoint":
+         """Persists a given checkpoint to the current checkpoint path on the filesystem.
+
+         This method copies the checkpoint files to the storage location.
+         It's up to the user to delete the original checkpoint files if desired.
+
+         (The original checkpoint directory is typically a local temporary directory.)
+
+         Args:
+             checkpoint: The checkpoint to persist to
+                 (fs, experiment_fs_path / checkpoint_dir_name).
+
+         Returns:
+             Checkpoint: A Checkpoint pointing to the persisted checkpoint location.
+         """
+         # TODO(justinvyu): Fix this cyclical import.
+         from ray.train import Checkpoint
+
+         checkpoint_fs_path = self.build_checkpoint_path_from_name(checkpoint_dir_name)
+
+         logger.debug(
+             "Copying checkpoint files to storage path:\n"
+             "({source_fs}, {source}) -> ({dest_fs}, {destination})".format(
+                 source=checkpoint.path,
+                 destination=checkpoint_fs_path,
+                 source_fs=checkpoint.filesystem,
+                 dest_fs=self.storage_filesystem,
+             )
+         )
+
+         # Raise an error if the storage path is not accessible when
+         # attempting to upload a checkpoint from a remote worker.
+         # Ex: If storage_path is a local path, then a validation marker
+         # will only exist on the head node but not the worker nodes.
+         self._check_validation_file()
+
+         self.storage_filesystem.create_dir(checkpoint_fs_path)
+         _pyarrow_fs_copy_files(
+             source=checkpoint.path,
+             destination=checkpoint_fs_path,
+             source_filesystem=checkpoint.filesystem,
+             destination_filesystem=self.storage_filesystem,
+         )
+
+         persisted_checkpoint = Checkpoint(
+             filesystem=self.storage_filesystem,
+             path=checkpoint_fs_path,
+         )
+         logger.info(f"Checkpoint successfully created at: {persisted_checkpoint}")
+         return persisted_checkpoint
+
+     @property
+     def experiment_fs_path(self) -> str:
+         """The path on the `storage_filesystem` to the experiment directory.
+
+         NOTE: This does not have a URI prefix anymore, since it has been stripped
+         by pyarrow.fs.FileSystem.from_uri already. The URI scheme information is
+         kept in `storage_filesystem` instead.
+         """
+         return Path(self.storage_fs_path, self.experiment_dir_name).as_posix()
+
+     @property
+     def local_working_directory(self) -> str:
+         """Every Ray Train worker sets this directory as its working directory."""
+         if self.experiment_dir_name is None:
+             raise RuntimeError(
+                 "Cannot access `local_working_directory` without "
+                 "setting `experiment_dir_name`"
+             )
+         return Path(_get_ray_train_session_dir(), self.experiment_dir_name).as_posix()
+
+     @property
+     def checkpoint_manager_snapshot_path(self) -> str:
+         """The path to the checkpoint manager snapshot file."""
+         return Path(
+             self.experiment_fs_path, CHECKPOINT_MANAGER_SNAPSHOT_FILENAME
+         ).as_posix()
+
+     @staticmethod
+     def get_experiment_dir_name(run_obj: Union[str, Callable, Type]) -> str:
+         from ray.tune.experiment import Experiment
+
+         run_identifier = Experiment.get_trainable_name(run_obj)
+
+         if bool(int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0))):
+             dir_name = run_identifier
+         else:
+             dir_name = "{}_{}".format(run_identifier, date_str())
+         return dir_name
+
+     @staticmethod
+     def make_default_checkpoint_dir_name():
+         """Get the name of the checkpoint directory, based on the current timestamp."""
+         return f"checkpoint_{date_str(include_ms=True)}"
+
+     def extract_checkpoint_dir_name_from_path(self, checkpoint_path: str) -> str:
+         """Get the checkpoint name from the checkpoint path.
+         The parent directory of the checkpoint path should be the experiment directory.
+         """
+         # TODO: Use pathlib to extract the name once the minimum supported
+         # Python version is at least 3.9.
+         experiment_fs_path = self.experiment_fs_path + "/"
+         if not checkpoint_path.startswith(experiment_fs_path):
+             raise ValueError(
+                 f"Checkpoint path {checkpoint_path} is not under the experiment "
+                 f"directory {self.experiment_fs_path}."
+             )
+         return checkpoint_path[len(experiment_fs_path) :]
+
+     def build_checkpoint_path_from_name(self, checkpoint_name: str) -> str:
+         """Get the checkpoint path from the checkpoint name.
+         The parent directory of the checkpoint path should be the experiment directory.
+         """
+         return Path(self.experiment_fs_path, checkpoint_name).as_posix()
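+
+     # A brief round-trip sketch for the two checkpoint-path helpers above
+     # (illustrative only; the method name and checkpoint name are hypothetical).
+     def _example_checkpoint_path_round_trip(self) -> None:  # pragma: no cover
+         # Building a checkpoint path and extracting its name back are inverses:
+         # with experiment_fs_path == "bucket/path/exp_name", the built path is
+         # "bucket/path/exp_name/checkpoint_000001".
+         checkpoint_path = self.build_checkpoint_path_from_name("checkpoint_000001")
+         assert (
+             self.extract_checkpoint_dir_name_from_path(checkpoint_path)
+             == "checkpoint_000001"
+         )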