diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd360bbbd9014c6a3725a7bc085bd9d870dfa544 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8239f824d784f69fda7d5a332750598d90befe31 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1463a8c55eda528286615c67325fa82dd8f0aecd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b35644e15d8d874cc0393d4bc42902ea9606c90 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29f1d86f8f67ed628149614cffb9e7522a6eb3dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b32970b7e438613b2ad8c983c8c14f8b18e2470 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..170258dbcfc06e7e14b1825723e14c08e1faf95e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9786f430329549d9fd807a84b3197f1d6f520b3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..75e9c11e701f09bcec7369446c722b0fefb48159 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c542edf29626491786a1b2e29e59d9bb85d56746 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8306b67685612dc1585cc592646ac9eed2a70343 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c9d7ff7993c9e50477ba2b62c1b40b8fb717bff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f591ad7a5479bb1c814035f69cc927a14ed25b2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7c622f384ade5cfd4be3fde19dcf593463c21e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eedc278c3115280a695c0aaac366b71970b1a47 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3fe351c87c8c09802a3e8bc0aa268ee9c06a2b0 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf6f8d984ad927871db737a4dfe5b559ccf3322a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70c0add20ebf5e38c2553ffc53a3cf6313f9b1fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..792797bbc8108698e9e49270a502c2118f279e98 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ee8257dd94b3dc28faef502583476594a64e580 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cd18f1a44f5ed77bcf7e14043100077aa00c72b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b8290e4aba5ff4f50d02b618e5c4447e97670cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..1c31098d86defc704ddef75172543813361119da --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py @@ -0,0 +1,5 @@ +import abc + + +class Accelerator(abc.ABC): + """A utility that contains methods to accelerate training.""" diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..3815f31add4097ce2e64f1c7e03f6385bb38ee7a --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py @@ -0,0 +1,830 @@ +import logging +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar + +import ray +import ray._private.ray_constants as ray_constants +from ray._private.ray_constants import env_integer +from ray.data import Dataset +from ray.exceptions import RayActorError +from ray.train import Checkpoint, DataConfig +from ray.train._internal.session import ( + TrialInfo, + _TrainingResult, + get_session, + init_session, + shutdown_session, +) +from ray.train._internal.storage import StorageContext +from ray.train._internal.utils import check_for_failure +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import BackendConfig +from ray.train.constants import ( + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + RAY_TRAIN_ENABLE_STATE_TRACKING, + TRAIN_ENABLE_WORKER_SPREAD_ENV, + TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, +) +from ray.util.placement_group import get_current_placement_group, remove_placement_group + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +class TrainBackendError(Exception): + """Errors with BackendExecutor that should not be exposed to user.""" + + +class TrainingWorkerError(Exception): + """Raised if a worker fails during training.""" + + +@dataclass +class ResourceConfig: + """ + Resource configuration for resource_ids to share between workers. + + Args: + resource_name: The name of the resource to configure + (Example: "neuron_cores" or "gpu"). + resource_enable_sharing_env_var: The environment variable to + check if the resource should be shared. + share_resource_ids_env_var: The environment variable to configure for + sharing the resources with other workers. + """ + + resource_name: str + resource_enable_sharing_env_var: str + share_resource_ids_env_var: str + + +class BackendExecutor: + """Main execution class for training backends. + + This class holds a worker group and is responsible for executing the + training function on the workers, and collecting intermediate results + from ``session.report()``. + + Args: + backend_config: The configurations for this + specific backend. + num_workers: Number of workers to use for training. + resources_per_worker (Optional[Dict[str, float]]): + Dictionary specifying the resources that will be + requested for each worker. Defaults to {"CPU": 1}. + max_retries: Number of retries when Ray actors fail. + Defaults to 3. Set to -1 for unlimited retries. + """ + + def __init__( + self, + backend_config: BackendConfig, + # TODO(xwjiang): Legacy Ray Train trainer clean up! 
+ trial_info: Optional[TrialInfo] = None, + num_workers: int = 1, + resources_per_worker: Optional[Dict[str, float]] = None, + max_retries: int = 3, + ): + if resources_per_worker is None: + self._resources_per_worker = {"CPU": 1} + else: + self._resources_per_worker = resources_per_worker.copy() + + self._backend_config = backend_config + self._backend = backend_config.backend_cls() + self._num_workers = num_workers + self._max_failures = max_retries + if self._max_failures < 0: + self._max_failures = float("inf") + self._num_failures = 0 + self._last_failure = None + self._initialization_hook = None + self._placement_group = None + + self._trial_info = trial_info + + self.worker_group = InactiveWorkerGroup() + self.dataset_shards = None + + self._resource_configs = [ + ResourceConfig( + ray_constants.NEURON_CORES, + ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR, + ), + ResourceConfig( + ray_constants.NPU, + ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, + ), + # For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var. + ResourceConfig( + ray_constants.GPU, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR, + ), + ] + + # Record the initialization time of BackendExecutor, which is + # after trainer.fit() and before worker_group executes the training function. + self._start_time_ms = int(time.time() * 1000) + + self.state_tracking_enabled = env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0) + + def start( + self, + initialization_hook: Optional[Callable[[], None]] = None, + train_cls: Optional[Type] = None, + train_cls_args: Optional[Tuple] = None, + train_cls_kwargs: Optional[Dict] = None, + ): + """Starts the worker group.""" + self._create_placement_group() + placement_group = self._placement_group or "default" + self.worker_group = WorkerGroup( + num_workers=self._num_workers, + resources_per_worker=self._resources_per_worker, + actor_cls=train_cls, + actor_cls_args=train_cls_args, + actor_cls_kwargs=train_cls_kwargs, + placement_group=placement_group, + ) + # Hack to avoid OOMs. + # This is just a temporary solution for Train loading entire checkpoints + # into memory by ensuring that the rank 0 worker is on the same node as + # trainable, thus allowing for lazy checkpoint transfer to be used. + # See https://github.com/ray-project/ray/issues/33073 + # for more context. + # TODO remove passing in trial_driver_ip. + + trial_driver_node_id = ( + self._trial_info.driver_node_id if self._trial_info else None + ) + self.worker_group.sort_workers_by_node_id_and_gpu_id(trial_driver_node_id) + + try: + if initialization_hook: + self._initialization_hook = initialization_hook + self.worker_group.execute(initialization_hook) + + # Always propagate the driver's DataContext to each worker in the group. 
+ from ray.data import DataContext + + def _set_driver_dataset_context(ctx: DataContext): + DataContext._set_current(ctx) + + self.worker_group.execute( + _set_driver_dataset_context, + DataContext.get_current(), + ) + + share_cuda_visible_devices_enabled = bool( + env_integer( + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + self._backend.share_cuda_visible_devices, + ) + ) + + if ( + self._resources_per_worker.get("GPU", 0) > 0 + and share_cuda_visible_devices_enabled + ): + self._share_cuda_visible_devices() + for resource_config in self._resource_configs: + if self._is_share_resources_enabled( + resource_config.resource_name, + resource_config.resource_enable_sharing_env_var, + ): + self._share_resource_ids( + resource_config.resource_name, + resource_config.share_resource_ids_env_var, + ) + self._backend.on_start(self.worker_group, self._backend_config) + except RayActorError as exc: + logger.exception(str(exc)) + logger.warning( + "Failure occurred during startup. Restarting all workers and " + "attempting to startup again." + ) + self._increment_failures() + self._restart() + + if self.state_tracking_enabled: + from ray.train._internal.state import TrainRunStateManager + from ray.train._internal.state.state_actor import get_state_actor + + self.state_manager = TrainRunStateManager(state_actor=get_state_actor()) + + def _create_placement_group(self): + """Creates a placement group if it does not exist. + + If a placement group is already detected (Tune) this will be a no-op. + + By default the placement group will be created with PACK strategy. + This is optimized for colocating GPUs on a minimal number of nodes. + This behavior can be overridden to use the SPREAD strategy by defining + ``TRAIN_ENABLE_WORKER_SPREAD_ENV`` + + If a placement group is created it will be stored as + self._placement_group. + """ + current_placement_group = get_current_placement_group() + worker = ray._private.worker.global_worker + should_capture_child_tasks_in_placement_group = ( + worker.should_capture_child_tasks_in_placement_group + ) + should_create_placement_group = ( + current_placement_group is None + or not should_capture_child_tasks_in_placement_group + ) + + if should_create_placement_group: + bundles = [ + self._resources_per_worker.copy() for _ in range(self._num_workers) + ] + + use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0)) + strategy = "SPREAD" if use_spread else "PACK" + + placement_group = ray.util.placement_group(bundles, strategy=strategy) + logger.debug("Waiting for placement group to start.") + timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100) + ready, _ = ray.wait([placement_group.ready()], timeout=timeout) + if ready: + logger.debug("Placement group has started.") + else: + raise TimeoutError( + "Placement group creation timed out. Make sure your " + "cluster either has enough resources or use an " + "autoscaling cluster. If you are running on a cluster, " + "make sure you specify an address in `ray.init()`, for example, " + '`ray.init("auto")`. You can also increase the timeout by setting ' + "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. " + "Current resources available: {}, resources requested by the " + "placement group: {}".format( + ray.available_resources(), placement_group.bundle_specs + ) + ) + self._placement_group = placement_group + + def _share_cuda_visible_devices(self): + """Sets CUDA_VISIBLE_DEVICES on all workers. 
+ + For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs + visible to all workers on that worker's node. + + This allows GPU workers on the same node to communicate with one + another. + + Example: + + Setup: + - Node1: + - Worker1: {0, 1} + - Worker2: {2, 3} + - Node2: + - Worker3: {0, 1} + + CUDA_VISIBLE_DEVICES: + - Worker1: "0,1,2,3" + - Worker2: "0,1,2,3" + - Worker3: "0,1" + + """ + self._share_resource_ids( + ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR + ) + + def _share_resource_ids(self, resource: str, env_var: str): + """Sets the given env_var on all workers. + + For each worker, the cores/devices are visible to all the + workers on that worker's node. This allows workers on the + same node to communicate with one another. + + Example: + + Setup: + - Node1: + - Worker1: {0, 1} + - Worker2: {2, 3} + - Node2: + - Worker3: {0, 1} + + NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...: + - Worker1: "0,1,2,3" + - Worker2: "0,1,2,3" + - Worker3: "0,1" + + Args: + resource: The name of the resource/accelerator. + env_var: The name of the environment variable to set. + """ + node_ids_and_resource_ids = [ + ( + w.metadata.node_id, + w.metadata.resource_ids[resource], + ) + for w in self.worker_group.workers + ] + node_id_to_worker_id = defaultdict(set) + node_id_to_resource_ids = defaultdict(set) + + for worker_id, (node_id, resource_ids) in enumerate(node_ids_and_resource_ids): + node_id_to_worker_id[node_id].add(worker_id) + node_id_to_resource_ids[node_id].update(resource_ids) + + futures = [] + for node_id, resource_ids in node_id_to_resource_ids.items(): + resource_ids = sorted(resource_ids) + all_resource_ids = ",".join(resource_ids) + + def set_resource_ids(): + os.environ[env_var] = all_resource_ids + + for worker_id in node_id_to_worker_id[node_id]: + futures.append( + self.worker_group.execute_single_async(worker_id, set_resource_ids) + ) + ray.get(futures) + + def _is_share_resources_enabled(self, resource_name: str, enable_sharing_env: str): + """Whether to share resource IDs on all workers + based on enable_sharing_env. + + This will return True if the resource is requested and greater than 0. + Also, the user can disable sharing by setting `enable_sharing_env` to "0". + + Args: + resource_name: The name of the resource/accelerator. + enable_sharing_env: The name of the environment variable + to check. + """ + has_resource_requested = self._resources_per_worker.get(resource_name, 0) > 0 + return has_resource_requested and ray_constants.env_bool( + enable_sharing_env, True + ) + + def _create_rank_world_size_mappings(self) -> List[Dict]: + """Create rank and world size mappings for workers. + There are three maps returned: + - local_rank_map, which maps from worker world_rank to local_rank. + - local_world_size_map, which maps from world_rank to local_world_size. + - node_rank_map, which maps from world rank to node rank. + + Example: + Worker 0: node 0 + Worker 1: node 0 + Worker 2: node 1 + Worker 3: node 0 + Worker 4: node 1 + + Workers 0, 1, 3 are on node 0. + Workers 2, 4 are on node 1.
+ + Expected local_rank_map: + { + 0 -> 0, + 1 -> 1, + 2 -> 0, + 3 -> 2, + 4 -> 1 + } + + Expected local_world_size_map: + { + 0 -> 3, + 1 -> 3, + 2 -> 2, + 3 -> 3, + 4 -> 2 + } + + Expected node_rank_map: + { + 0 -> 0, + 1 -> 0, + 2 -> 1, + 3 -> 0, + 4 -> 1 + } + + """ + local_rank_map = {} # map from world rank to local rank + local_world_size_map = {} # map from world rank to local world size + node_rank_map = {} # map from world rank to node rank + node_ids = {} # map from node id to node index + node_cnt = 0 # count the number of nodes + + node_id_dict = defaultdict( + int + ) # map from node id to the number of workers on it. + for world_rank in range(len(self.worker_group)): + worker = self.worker_group.workers[world_rank] + node_id = worker.metadata.node_id + local_rank_map[world_rank] = node_id_dict[node_id] + node_id_dict[node_id] += 1 + + if node_id not in node_ids: + node_ids[node_id] = node_cnt + node_cnt += 1 + node_rank_map[world_rank] = node_ids[node_id] + + for world_rank in range(len(self.worker_group)): + worker = self.worker_group.workers[world_rank] + node_id = worker.metadata.node_id + local_world_size_map[world_rank] = node_id_dict[node_id] + + workers_info = "\n".join( + [ + f"- (node_id={w.metadata.node_id}, ip={w.metadata.node_ip}, " + f"pid={w.metadata.pid}) world_rank={i}, " + f"local_rank={local_rank_map[i]}, node_rank={node_rank_map[i]}" + for i, w in enumerate(self.worker_group.workers) + ] + ) + logger.info(f"Started distributed worker processes: \n{workers_info}") + + return local_rank_map, local_world_size_map, node_rank_map + + def start_training( + self, + train_func: Callable[[], T], + datasets: Dict[str, Dataset], + metadata: Dict[str, Any], + data_config: DataConfig, + storage: StorageContext, + checkpoint: Optional[Checkpoint] = None, + ) -> None: + """Executes a training function on all workers in a separate thread. + + ``finish_training`` should be called after this. + + Args: + train_func: The training function to run on each worker. + datasets: The base datasets. + data_config: The config object for creating dataset shards for workers. + checkpoint: The checkpoint data that + should be loaded onto each worker and accessed by the + training function via ``session.get_checkpoint()``. If this + is ``None`` then no checkpoint will be loaded. + """ + use_detailed_autofilled_metrics = env_integer( + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0 + ) + + # First initialize the session. + def initialize_session( + train_func, + world_rank, + local_rank, + node_rank, + local_world_size, + world_size, + trial_info, + checkpoint, + dataset_shard, + metadata, + storage, + ): + try: + init_session( + training_func=train_func, + world_rank=world_rank, + local_rank=local_rank, + node_rank=node_rank, + local_world_size=local_world_size, + world_size=world_size, + trial_info=trial_info, + dataset_shard=dataset_shard, + metadata=metadata, + checkpoint=checkpoint, + detailed_autofilled_metrics=use_detailed_autofilled_metrics, + storage=storage, + ) + except ValueError: + raise TrainBackendError( + "Attempting to start training but a " + "previous training run is still ongoing. " + "You must call `finish_training` before " + "calling `start_training` again." 
+ ) + + if self.dataset_shards is None: + actors = [worker.actor for worker in self.worker_group.workers] + node_ids = [worker.metadata.node_id for worker in self.worker_group.workers] + self.dataset_shards = data_config.configure( + datasets, + world_size=len(self.worker_group), + worker_handles=actors, + worker_node_ids=node_ids, + ) + + ( + local_rank_map, + local_world_size_map, + node_rank_map, + ) = self._create_rank_world_size_mappings() + + futures = [] + for index in range(len(self.worker_group)): + futures.append( + self.worker_group.execute_single_async( + index, + initialize_session, + world_rank=index, + local_rank=local_rank_map[index], + node_rank=node_rank_map[index], + local_world_size=local_world_size_map[index], + world_size=len(self.worker_group), + trial_info=self._trial_info, + train_func=train_func, + dataset_shard=self.dataset_shards[index], + metadata=metadata, + checkpoint=checkpoint, + storage=storage, + ) + ) + + self._backend.on_training_start(self.worker_group, self._backend_config) + + self.get_with_failure_handling(futures) + + # Register Train Run before training starts + if self.state_tracking_enabled: + from ray.train._internal.state.schema import RunStatusEnum + + core_context = ray.runtime_context.get_runtime_context() + + self.state_manager.register_train_run( + run_id=self._trial_info.run_id, + run_name=self._trial_info.experiment_name, + job_id=core_context.get_job_id(), + controller_actor_id=core_context.get_actor_id(), + datasets=datasets, + worker_group=self.worker_group, + start_time_ms=self._start_time_ms, + run_status=RunStatusEnum.RUNNING, + ) + + # Run the training function asynchronously in its own thread. + def train_async(): + session = get_session() + session.start() + + self.worker_group.execute_async(train_async) + + def get_next_results(self) -> Optional[List[_TrainingResult]]: + """Fetches the next ``_TrainingResult`` from each worker. + + Each ``_TrainingResult`` is expected to correspond to the same step from + each worker (e.g. the same call to ``train.report()``). + + Returns: + A list of ``_TrainingResult``s or ``None`` if there are no more results + since the training function has exited on all workers. + """ + + def get_next(): + session = _get_session("get_next_results") + try: + result = session.get_next() + except RuntimeError: + # Training thread has not been started yet. + raise TrainBackendError( + "`get_next_results` has been called " + "before `start_training`. Please call " + "`start_training` before " + "`get_next_results`." + ) + + return result + + # Get next result from each worker. + futures = self.worker_group.execute_async(get_next) + results = self.get_with_failure_handling(futures) + + # Check if any worker returned None. + if any(r is None for r in results): + # Either all workers have results or none of them do. + if not all(r is None for r in results): + raise RuntimeError( + "Some workers returned results while " + "others didn't. Make sure that " + "`session.report()` are called the " + "same number of times on all workers." + ) + else: + # Return None if all results are None. + return None + + return results + + def pause_reporting(self): + """Disable workers from enqueuing results from ``session.report()``. + + Note: Already reported results may still be enqueued at this point, + and should be handled appropriately. 
+ """ + + def pause_session_reporting(): + session = _get_session("pause_reporting") + return session.pause_reporting() + + futures = self.worker_group.execute_async(pause_session_reporting) + self.get_with_failure_handling(futures) + + def finish_training(self): + """Finish training and return final results. Propagate any exceptions. + + Blocks until training is finished on all workers. + + Assumes `start_training` has already been called. + + Returns: + A list of return values from calling ``train_func`` on each worker. + Each item corresponds to the return value from a single worker. + """ + + def end_training(): + session = _get_session("finish_training") + try: + # session.finish raises any Exceptions from training. + output = session.finish() + finally: + # Shutdown session even if session.finish() raises an + # Exception. + shutdown_session() + + return output + + futures = self.worker_group.execute_async(end_training) + results = self.get_with_failure_handling(futures) + return results + + def report_final_run_status( + self, + errored: bool = False, + failed_rank: Optional[int] = None, + stack_trace: Optional[str] = None, + ): + """Report the final train run status, error, and end time to TrainStateActor.""" + if self.state_tracking_enabled: + from ray.train._internal.state.schema import ( + MAX_ERROR_STACK_TRACE_LENGTH, + RunStatusEnum, + ) + + if errored: + run_status = RunStatusEnum.ERRORED + status_detail = "" + if failed_rank is not None: + status_detail += f"Rank {failed_rank} worker raised an error. \n" + if stack_trace is not None: + # Keep only the last part of the stack trace if it's too long. + status_detail += stack_trace[-MAX_ERROR_STACK_TRACE_LENGTH:] + else: + run_status = RunStatusEnum.FINISHED + status_detail = "" + + self.state_manager.end_train_run( + run_id=self._trial_info.run_id, + run_status=run_status, + status_detail=status_detail, + end_time_ms=int(time.time() * 1000), + ) + + def get_with_failure_handling(self, remote_values): + """Gets the remote values while handling for worker failures. + + This method should be called instead of ``ray.get()`` directly in + order to handle worker failures. + + If a worker failure is identified, backend specific failure handling + is executed and a ``TrainingWorkerError`` is raised. + + Args: + remote_values: List of object refs representing functions + that may fail in the middle of execution. For example, running + a Train training loop in multiple parallel actor calls. + Returns: + The resolved objects represented by the passed in ObjectRefs. + """ + success, exception = check_for_failure(remote_values) + if success: + return ray.get(remote_values) + else: + self._last_failure = exception + self._increment_failures() + logger.warning( + "Failure identified during training. Restarting all workers and " + "continuing training from latest checkpoint." + ) + self._restart() + raise TrainingWorkerError + + def shutdown(self, graceful_termination: bool = True): + """Shuts down the workers in the worker group. + + Args: + graceful_termination: If set to True, attempt to clean up the backend + before terminating the Ray actors. + + """ + if graceful_termination: + try: + self._backend.on_shutdown(self.worker_group, self._backend_config) + except RayActorError: + logger.warning( + "Graceful shutdown of backend failed. This is " + "expected if one of the workers has crashed." 
) + + if graceful_termination: + self.worker_group.shutdown() + else: + self.worker_group.shutdown(patience_s=0) + self.worker_group = InactiveWorkerGroup() + + if self._placement_group: + remove_placement_group(self._placement_group) + self._placement_group = None + + self.dataset_shards = None + + def is_started(self): + return not isinstance(self.worker_group, InactiveWorkerGroup) + + def _restart(self): + self.worker_group.shutdown() + if self._initialization_hook is not None: + initialization_hook = self._initialization_hook + else: + initialization_hook = None + if self._placement_group: + remove_placement_group(self._placement_group) + self._placement_group = None + self.start(initialization_hook=initialization_hook) + + def _increment_failures(self): + self._num_failures += 1 + if self._num_failures >= self._max_failures: + failure = self._last_failure + self._last_failure = None + if self._max_failures > 0: + exc = RuntimeError( + "Training has failed after " f"{self._num_failures} " "attempts." + ) + raise exc.with_traceback(None) from failure + else: + raise failure + + def get_worker_group(self): + return self.worker_group + + def _get_num_failures(self): + return self._num_failures + + +class InactiveWorkerGroupError(Exception): + """Raised when underlying worker group is inactive.""" + + +class InactiveWorkerGroup: + # TODO: fix inheritance. Perhaps create WorkerGroupInterface. + + # Need to define getstate and setstate so that getattr does not screw up + # pickling. See https://stackoverflow.com/a/50888571/11249691 + def __getstate__(self): + return vars(self) + + def __setstate__(self, state): + vars(self).update(state) + + def __getattr__(self, name): + raise InactiveWorkerGroupError() + + def __len__(self): + raise InactiveWorkerGroupError() + + +def _get_session(method_name: str): + # Get the session for this worker. + session = get_session() + if not session: + # Session is not initialized yet. + raise TrainBackendError( + f"`{method_name}` has been called " + "before `start_training`. Please call " + "`start_training` before " + f"`{method_name}`." + ) + return session diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0eb55275246b5dcc7cff0c6fda58f2318b5076 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py @@ -0,0 +1,185 @@ +import logging +import numbers +from typing import Any, Callable, List, Optional, Tuple + +from ray._private.dict import flatten_dict +from ray.air._internal.util import is_nan +from ray.air.config import MAX +from ray.train import CheckpointConfig +from ray.train._internal.session import _TrainingResult +from ray.train._internal.storage import _delete_fs_path + +logger = logging.getLogger(__name__) + + +def _insert_into_sorted_list(list: List[Any], item: Any, key: Callable[[Any], Any]): + """Insert an item into a sorted list with a custom key function.
+ + Examples: + + >>> list = [] + >>> _insert_into_sorted_list(list, {"a": 1, "b": 0}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}] + >>> _insert_into_sorted_list(list, {"a": 3, "b": 1}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}] + >>> _insert_into_sorted_list(list, {"a": 4, "b": 2}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}] + >>> _insert_into_sorted_list(list, {"a": 1, "b": 3}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 1, 'b': 3}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}] + """ + i = 0 + while i < len(list): + # Insert to the right of all duplicates. + if key(list[i]) > key(item): + break + i += 1 + list.insert(i, item) + + +class _CheckpointManager: + """Checkpoint manager that handles checkpoint book-keeping for a trial. + + The main purpose of this abstraction is to keep the top K checkpoints based on + recency/a user-provided metric. + + NOTE: This class interacts with `_TrainingResult` objects, which are + (checkpoint, metrics) pairs. This is to order checkpoints by metrics. + + Args: + checkpoint_config: Defines how many and which checkpoints to keep. + """ + + def __init__(self, checkpoint_config: Optional[CheckpointConfig]): + self._checkpoint_config = checkpoint_config or CheckpointConfig() + + # List of checkpoints ordered by ascending score. + self._checkpoint_results: List[_TrainingResult] = [] + + # The latest registered checkpoint. + # This should never be immediately deleted upon registration, + # even if it's not in the top K checkpoints, based on score. + self._latest_checkpoint_result: Optional[_TrainingResult] = None + + if ( + self._checkpoint_config.num_to_keep is not None + and self._checkpoint_config.num_to_keep <= 0 + ): + raise ValueError( + f"`num_to_keep` must >= 1, got: " + f"{self._checkpoint_config.num_to_keep}" + ) + + @property + def checkpoint_config(self): + return self._checkpoint_config + + def register_checkpoint(self, checkpoint_result: _TrainingResult): + """Register new checkpoint and add to bookkeeping. + + This method will register a new checkpoint and add it to the internal + bookkeeping logic. This means the checkpoint manager will decide if + this checkpoint should be kept, and if older or worse performing + checkpoints should be deleted. + + Args: + checkpoint: Tracked checkpoint object to add to bookkeeping. + """ + self._latest_checkpoint_result = checkpoint_result + + if self._checkpoint_config.checkpoint_score_attribute is not None: + # If we're ordering by a score, insert the checkpoint + # so that the list remains sorted. + _insert_into_sorted_list( + self._checkpoint_results, + checkpoint_result, + key=self._get_checkpoint_score, + ) + else: + # If no metric is provided, just append (ordering by time of registration). + self._checkpoint_results.append(checkpoint_result) + + if self._checkpoint_config.num_to_keep is not None: + # Delete the bottom (N - K) checkpoints + worst_results = set( + self._checkpoint_results[: -self._checkpoint_config.num_to_keep] + ) + # Except for the latest checkpoint. + results_to_delete = worst_results - {self._latest_checkpoint_result} + + # Update internal state before actually deleting them. 
+ self._checkpoint_results = [ + checkpoint_result + for checkpoint_result in self._checkpoint_results + if checkpoint_result not in results_to_delete + ] + + for checkpoint_result in results_to_delete: + checkpoint = checkpoint_result.checkpoint + logger.debug("Deleting checkpoint: ", checkpoint) + _delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path) + + def _get_checkpoint_score( + self, checkpoint: _TrainingResult + ) -> Tuple[bool, numbers.Number]: + """Get the score for a checkpoint, according to checkpoint config. + + If `mode="min"`, the metric is negated so that the lowest score is + treated as the best. + + Returns: + Tuple: A tuple of (not_is_nan: bool, score: numbers.Number). + This score orders: nan values < float("-inf") < valid numeric metrics + """ + checkpoint_score_attribute = self._checkpoint_config.checkpoint_score_attribute + if checkpoint_score_attribute: + flat_metrics = flatten_dict(checkpoint.metrics) + try: + checkpoint_result = flat_metrics[checkpoint_score_attribute] + except KeyError: + valid_keys = list(flat_metrics.keys()) + logger.error( + f"Result dict has no key: {checkpoint_score_attribute}. " + f"checkpoint_score_attr must be set to a key in the " + f"result dict. Valid keys are: {valid_keys}" + ) + checkpoint_result = float("-inf") + else: + checkpoint_result = float("-inf") + + checkpoint_score_order = self._checkpoint_config.checkpoint_score_order + order_factor = 1.0 if checkpoint_score_order == MAX else -1.0 + + checkpoint_score = order_factor * checkpoint_result + + if not isinstance(checkpoint_score, numbers.Number): + raise ValueError( + f"Unable to persist checkpoint for " + f"checkpoint_score_attribute: " + f"{checkpoint_score_attribute} with value " + f"{checkpoint_score}. " + f"This attribute must be numerical." + ) + + return ( + (not is_nan(checkpoint_score), checkpoint_score) + if not is_nan(checkpoint_score) + else (False, float("-inf")) + ) + + @property + def best_checkpoint_result(self) -> Optional[_TrainingResult]: + return self._checkpoint_results[-1] if self._checkpoint_results else None + + @property + def latest_checkpoint_result(self) -> Optional[_TrainingResult]: + return self._latest_checkpoint_result + + @property + def best_checkpoint_results(self) -> List[_TrainingResult]: + if self._checkpoint_config.num_to_keep is None: + return self._checkpoint_results + return self._checkpoint_results[-self._checkpoint_config.num_to_keep :] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py new file mode 100644 index 0000000000000000000000000000000000000000..13ec9addb3bf2027e49b8cf08b49e4a7b40d5451 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py @@ -0,0 +1,139 @@ +import copy +from typing import Dict, List, Literal, Optional, Union + +import ray +from ray.actor import ActorHandle +from ray.data import DataIterator, Dataset, ExecutionOptions, NodeIdStr +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI, PublicAPI + + +@PublicAPI(stability="stable") +class DataConfig: + """Class responsible for configuring Train dataset preprocessing. + + For advanced use cases, this class can be subclassed and the `configure()` method + overriden for custom data preprocessing. 
+ """ + + def __init__( + self, + datasets_to_split: Union[Literal["all"], List[str]] = "all", + execution_options: Optional[ExecutionOptions] = None, + ): + """Construct a DataConfig. + + Args: + datasets_to_split: Specifies which datasets should be split among workers. + Can be set to "all" or a list of dataset names. Defaults to "all", + i.e. split all datasets. + execution_options: The execution options to pass to Ray Data. By default, + the options will be optimized for data ingest. When overriding this, + base your options off of `DataConfig.default_ingest_options()`. + """ + if isinstance(datasets_to_split, list) or datasets_to_split == "all": + self._datasets_to_split = datasets_to_split + else: + raise TypeError( + "`datasets_to_split` should be a 'all' or a list of strings of " + "dataset names. Received " + f"{type(datasets_to_split).__name__} with value {datasets_to_split}." + ) + + self._execution_options: ExecutionOptions = ( + execution_options or DataConfig.default_ingest_options() + ) + + self._num_train_cpus = 0.0 + self._num_train_gpus = 0.0 + + def set_train_total_resources(self, num_train_cpus: float, num_train_gpus: float): + """Set the total number of CPUs and GPUs used by training. + + If CPU or GPU resource limits are not set, they will be set to the + total cluster resources minus the resources used by training. + """ + # TODO: We may also include other resources besides CPU and GPU. + self._num_train_cpus = num_train_cpus + self._num_train_gpus = num_train_gpus + + @DeveloperAPI + def configure( + self, + datasets: Dict[str, Dataset], + world_size: int, + worker_handles: Optional[List[ActorHandle]], + worker_node_ids: Optional[List[NodeIdStr]], + **kwargs, + ) -> List[Dict[str, DataIterator]]: + """Configure how Train datasets should be assigned to workers. + + Args: + datasets: The datasets dict passed to Train by the user. + world_size: The number of Train workers in total. + worker_handles: The actor handles of the Train workers. + worker_node_ids: The node ids of the Train workers. + kwargs: Forwards compatibility placeholder. + + Returns: + A list of dataset splits for each worker. The size of the list must be + equal to `world_size`. Each element of the list contains the assigned + `DataIterator` instances by name for the worker. + """ + output = [{} for _ in range(world_size)] + + if self._datasets_to_split == "all": + datasets_to_split = set(datasets.keys()) + else: + datasets_to_split = set(self._datasets_to_split) + + locality_hints = ( + worker_node_ids if self._execution_options.locality_with_output else None + ) + for name, ds in datasets.items(): + execution_options = copy.deepcopy(self._execution_options) + + if execution_options.is_resource_limits_default(): + # If "resource_limits" is not overriden by the user, + # add training-reserved resources to Data's exclude_resources. + execution_options.exclude_resources = ( + execution_options.exclude_resources.add( + ExecutionResources( + cpu=self._num_train_cpus, gpu=self._num_train_gpus + ) + ) + ) + + ds = ds.copy(ds) + ds.context.execution_options = execution_options + + if name in datasets_to_split: + for i, split in enumerate( + ds.streaming_split( + world_size, equal=True, locality_hints=locality_hints + ) + ): + output[i][name] = split + else: + for i in range(world_size): + output[i][name] = ds.iterator() + + return output + + @staticmethod + def default_ingest_options() -> ExecutionOptions: + """The default Ray Data options used for data ingest. 
+ + By default, configurations are carried over from what is already set + in DataContext. + """ + ctx = ray.data.DataContext.get_current() + return ExecutionOptions( + # TODO(hchen): Re-enable `locality_with_output` by default after fixing + # https://github.com/ray-project/ray/issues/40607 + locality_with_output=ctx.execution_options.locality_with_output, + resource_limits=ctx.execution_options.resource_limits, + exclude_resources=ctx.execution_options.exclude_resources, + preserve_order=ctx.execution_options.preserve_order, + verbose_progress=ctx.execution_options.verbose_progress, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..a159cfe7b9e141f1d25cfdf160285505efeb80b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py @@ -0,0 +1,103 @@ +import abc +from typing import Dict, Optional, TypeVar, Union + +import numpy as np +import pandas as pd + +from ray.air.util.data_batch_conversion import ( + BatchFormat, + _convert_batch_type_to_pandas, + _convert_pandas_to_batch_type, +) +from ray.train.predictor import Predictor +from ray.util.annotations import DeveloperAPI + +TensorType = TypeVar("TensorType") +TensorDtype = TypeVar("TensorDtype") + + +class DLPredictor(Predictor): + @abc.abstractmethod + def _arrays_to_tensors( + self, + numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Union[TensorType, Dict[str, TensorType]]: + """Converts a NumPy ndarray batch to the tensor type for the DL framework. + + Args: + numpy_arrays: A (dict of) NumPy ndarray(s) to convert to a (dict of) + tensor(s). + dtype: A (dict of) tensor dtype(s) to use when creating the DL tensor; if + None, the dtype will be inferred from the NumPy ndarray data. + + Returns: + A deep learning framework specific tensor. + """ + raise NotImplementedError + + @abc.abstractmethod + def _tensor_to_array(self, tensor: TensorType) -> np.ndarray: + """Converts a framework-specific tensor to a numpy array. + + Args: + tensor: A framework specific tensor. + + Returns: + A numpy array representing the input tensor. + """ + + raise NotImplementedError + + @abc.abstractmethod + @DeveloperAPI + def call_model( + self, inputs: Union[TensorType, Dict[str, TensorType]] + ) -> Union[TensorType, Dict[str, TensorType]]: + """Inputs the tensor to the model for this Predictor and returns the result. + + Args: + inputs: The tensor to input to the model. + + Returns: + A tensor or dictionary of tensors containing the model output.
+ """ + raise NotImplementedError + + @classmethod + @DeveloperAPI + def preferred_batch_format(cls) -> BatchFormat: + return BatchFormat.NUMPY + + def _predict_pandas( + self, + data: pd.DataFrame, + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> pd.DataFrame: + numpy_input = _convert_pandas_to_batch_type( + data, + BatchFormat.NUMPY, + self._cast_tensor_columns, + ) + numpy_output = self._predict_numpy(numpy_input, dtype) + return _convert_batch_type_to_pandas(numpy_output) + + def _predict_numpy( + self, + data: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: + # Single column selection return numpy array so preprocessors can be + # reused in both training and prediction + if isinstance(data, dict) and len(data) == 1: + data = next(iter(data.values())) + model_input = self._arrays_to_tensors(data, dtype) + model_output = self.call_model(model_input) + # TODO (jiaodong): Investigate perf implication of this. + # Move DL Tensor to CPU and convert to numpy. + if isinstance(model_output, dict): + return {k: self._tensor_to_array(v) for k, v in model_output.items()} + else: + return {"predictions": self._tensor_to_array(model_output)} diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..26259214d84cd1643ea40751cbc5b7ebdbf3d995 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py @@ -0,0 +1,45 @@ +from typing import Optional + +import ray.cloudpickle as ray_pickle +from ray._private.utils import binary_to_hex, hex_to_binary +from ray.data.preprocessor import Preprocessor +from ray.train._checkpoint import Checkpoint + +PREPROCESSOR_KEY = "preprocessor_pkl" + + +class FrameworkCheckpoint(Checkpoint): + """A checkpoint to preserve the functionality of legacy + framework-specific checkpoints. + + Example: + + >>> import tempfile + >>> checkpoint = FrameworkCheckpoint(tempfile.mkdtemp()) + >>> checkpoint.get_preprocessor() is None + True + >>> preprocessor = Preprocessor() + >>> preprocessor._attr = 1234 + >>> checkpoint.set_preprocessor(preprocessor) + >>> checkpoint.get_preprocessor()._attr + 1234 + """ + + def get_preprocessor(self) -> Optional[Preprocessor]: + """Return the preprocessor stored in the checkpoint. + + Returns: + The preprocessor stored in the checkpoint, or ``None`` if no + preprocessor was stored. 
+ """ + metadata = self.get_metadata() + preprocessor_bytes = metadata.get(PREPROCESSOR_KEY) + if preprocessor_bytes is None: + return None + return ray_pickle.loads(hex_to_binary(preprocessor_bytes)) + + def set_preprocessor(self, preprocessor: Preprocessor): + """Store a preprocessor with the checkpoint.""" + self.update_metadata( + {PREPROCESSOR_KEY: binary_to_hex(ray_pickle.dumps(preprocessor))} + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py new file mode 100644 index 0000000000000000000000000000000000000000..f142685caaab7f24f2fdb51938efa06eb9d6c4b1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py @@ -0,0 +1,1163 @@ +import functools +import logging +import os +import platform +import queue +import sys +import threading +import time +import warnings +from dataclasses import dataclass +from datetime import datetime +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Set, Type + +import ray +from ray.air._internal.util import RunnerThread, StartTraceback +from ray.air.constants import ( + _ERROR_FETCH_TIMEOUT, + _RESULT_FETCH_TIMEOUT, + SESSION_MISUSE_LOG_ONCE_KEY, + TIME_THIS_ITER_S, + TIMESTAMP, +) +from ray.data import Dataset +from ray.train import Checkpoint +from ray.train._internal.accelerator import Accelerator +from ray.train._internal.storage import StorageContext +from ray.train.constants import ( + CHECKPOINT_DIR_NAME, + DETAILED_AUTOFILLED_KEYS, + RAY_CHDIR_TO_TRIAL_DIR, + TIME_TOTAL_S, + WORKER_HOSTNAME, + WORKER_NODE_IP, + WORKER_PID, + _v2_migration_warnings_enabled, +) +from ray.train.error import SessionMisuseError +from ray.train.utils import _log_deprecation_warning +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.util.debug import log_once +from ray.util.placement_group import _valid_resource_shape +from ray.util.scheduling_strategies import ( + PlacementGroupSchedulingStrategy, + SchedulingStrategyT, +) + +if TYPE_CHECKING: + from ray.data import DataIterator + from ray.tune.execution.placement_groups import PlacementGroupFactory + + +logger = logging.getLogger(__name__) + + +@dataclass +class TrialInfo: + """The trial information to propagate to TrainSession.""" + + name: str + id: str + resources: Dict[str, float] + logdir: str + driver_ip: str + driver_node_id: str + experiment_name: Optional[str] = None + run_id: Optional[str] = None + + +class _FutureTrainingResult: + """A future that will be resolved to a `_TrainingResult`. + + This is needed for specific schedulers such as PBT that schedule saves. + + This wrapper should be removed after refactoring PBT to not schedule saves anymore. + """ + + def __init__(self, future: ray.ObjectRef): + self.future = future + + def resolve(self, block: bool = True) -> Optional["_TrainingResult"]: + """Resolve into ``_TrainingResult``. + + This will return None for function trainables if no checkpoint has been + saved before. 
+ """ + if block: + timeout = None + else: + timeout = 1e-9 + try: + return ray.get(self.future, timeout=timeout) + except TimeoutError: + # Not ready, yet + pass + except Exception as exc: + logger.error(f"Error resolving result: {exc}") + + +class _TrainingResult: + """A (checkpoint, metrics) result reported by the user.""" + + def __init__(self, checkpoint: Optional[Checkpoint], metrics: Dict[str, Any]): + self.checkpoint = checkpoint + self.metrics = metrics + + def __repr__(self) -> str: + return f"TrainingResult(checkpoint={self.checkpoint}, metrics={self.metrics})" + + +# TODO(xwjiang): This needs a better name. +@DeveloperAPI +class _TrainSession: + """Holds information for training on each worker.""" + + def __init__( + self, + training_func: Callable, + world_rank: Optional[int], + local_rank: Optional[int], + node_rank: Optional[int], + local_world_size: Optional[int], + world_size: Optional[int], + trial_info: Optional[TrialInfo] = None, + dataset_shard: Optional[Dict[str, Dataset]] = None, + metadata: Dict[str, Any] = None, + checkpoint: Optional[Checkpoint] = None, + detailed_autofilled_metrics: bool = False, + storage: Optional[StorageContext] = None, + synchronous_result_reporting: bool = False, + ): + # `synchronous_result_reporting` refers to whether or not the + # training function is immediately unblocked to continue running + # after the main thread receives its result. + # Ex 1: For 2 Ray Train workers with synchronous_result_reporting=True, + # the worker that produces a result first will immediately will continue + # onto the next iteration. + # Ex 2: For a Tune function Trainable with `synchronous_result_reporting=False`, + # training will only continue with an explicit call to `session.get_next`. + # Synchronous reporting in example 2 is needed for Tune schedulers to + # be able to stop the execution of the training function at will, + # for advanced pausing schedulers (PBT, BOHB) and actor reuse. + self.synchronous_result_reporting = synchronous_result_reporting + + # Ray Train worker properties + # Note: These are set to None for Tune function Trainables. + self.dataset_shard = dataset_shard + self.metadata = metadata + + self.world_rank = world_rank + self.local_rank = local_rank + self.node_rank = node_rank + self.local_world_size = local_world_size + self.world_size = world_size + + assert storage + logger.debug(f"StorageContext on SESSION (rank={world_rank}):\n{storage}") + + # NOTE: `reset` will initialize many properties needed to start running the + # training_func as a thread. + self.reset( + training_func=training_func, + trial_info=trial_info, + storage=storage, + loaded_checkpoint=checkpoint, + ) + + # Autofilled metrics attributes. + self.detailed_autofilled_metrics = detailed_autofilled_metrics + self.last_report_time = time.time() + self.iteration = 0 + self.time_total = 0.0 + self.local_ip = self.get_current_ip() + + self.accelerator = None + self._state = {} + + def get_state(self, key: str) -> Any: + return self._state.get(key) + + def set_state(self, key: str, value: Any): + self._state[key] = value + + def get_current_ip(self): + self.local_ip = ray.util.get_node_ip_address() + return self.local_ip + + def start(self): + """Starts the training thread.""" + self.training_started = True + self.training_thread.start() + + def reset( + self, + training_func: Callable, + trial_info: TrialInfo, + storage: StorageContext, + loaded_checkpoint=None, + ): + # This lock is used to control the execution of the training thread. 
+ self.continue_lock = threading.Semaphore(0) + + # This event is used to signal the training thread to stop. + self.stop_event = threading.Event() + + # Queue for sending results across threads. + self.result_queue = queue.Queue(1) + + # Queue for raising exceptions from runner thread to main thread. + # The error queue has a max size of one to prevent stacking error and force + # error reporting to block until finished. + self.error_queue = queue.Queue(1) + + # The Thread object that is running the training function. + self.training_thread = RunnerThread( + target=training_func, daemon=True, error_queue=self.error_queue + ) + + # Possibly override with new state + self.trial_info = trial_info + self.storage = storage + self.loaded_checkpoint = loaded_checkpoint + + # Reset state + self._state = {} + self.ignore_report = False + self.training_started = False + self._first_report = True + + # Change the working directory to a special trial folder. + # This is to ensure that all Ray Train workers have a common working directory. + os.makedirs(storage.trial_working_directory, exist_ok=True) + if bool(int(os.environ.get(RAY_CHDIR_TO_TRIAL_DIR, "1"))): + logger.debug( + f"Changing the working directory to: {storage.trial_working_directory}" + ) + os.chdir(storage.trial_working_directory) + + def pause_reporting(self): + """Ignore all future ``session.report()`` calls.""" + self.ignore_report = True + + def finish(self, timeout: Optional[float] = None) -> Optional[Any]: + """Finishes the training thread. + + Raises any Exception from training. + """ + # Set the stop event for the training thread to gracefully exit. + self.stop_event.set() + + # Release the lock so that training thread can process this event. + self.continue_lock.release() + + # Force a final (blocking) sync of artifacts in the trial path to storage. + self.storage.persist_artifacts(force=True) + + # Wait for training to finish. + # This will raise any errors that occur during training, including SystemError + # This returns the result of the training function. + output = None + if self.training_started: + output = self.training_thread.join(timeout=timeout) + + return output + + def get_next(self) -> Optional[_TrainingResult]: + """Gets the next ``_TrainingResult`` from the result queue. + + If the result queue is empty, then this function returns ``None``. + """ + if not self.training_started: + raise RuntimeError("Please call start before calling get_next.") + + if self.synchronous_result_reporting: + # There's no need to release the lock on the first report + # since `start` already started the training thread. + if not self._first_report: + # Release the lock to trigger training to continue, + # until the next call to report. + self.continue_lock.release() + self._first_report = False + + result = None + # While training is still ongoing, attempt to get the result. + while result is None and self.training_thread.is_alive(): + try: + result = self.result_queue.get( + block=True, timeout=_RESULT_FETCH_TIMEOUT + ) + except queue.Empty: + pass + + # If no result was found, then the runner must no longer be alive. + if result is None: + # Try one last time to fetch results in case results were + # reported in between the time of the last check and the + # termination of the thread runner. + try: + result = self.result_queue.get( + block=False, timeout=_RESULT_FETCH_TIMEOUT + ) + except queue.Empty: + pass + + # check if error occurred inside the thread runner. 
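+            # If no result was fetched and the runner thread raised an
+            # exception, surface that error now; otherwise note the pending
+            # error and hand back the fetched result first.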
+ if result is None: + # only raise an error from the runner if all results are consumed + self._report_thread_runner_error(block=True) + else: + if not self.error_queue.empty(): + logger.debug( + ( + "Runner error waiting to be raised in main thread. " + "Logging all available results first." + ) + ) + + if not self.synchronous_result_reporting: + # At this point, the training thread has reached + # the `train.report` and is blocked there. + # If performing asynchronous result reporting, + # release the lock to allow each worker to keep training + # immediately after the coordinator fetches their result. + self.continue_lock.release() + + # Return None if there are no more results to fetch. + return result + + def _auto_fill_metrics(self, result: dict) -> dict: + """Add autofilled metrics and update attributes.""" + current_time = time.time() + current_datetime = datetime.now() + if TIME_THIS_ITER_S in result: + time_this_iter = result[TIME_THIS_ITER_S] + else: + time_this_iter = current_time - self.last_report_time + self.iteration += 1 + self.time_total += time_this_iter + self.last_report_time = current_time + + auto_filled_metrics = { + TIMESTAMP: int(time.mktime(current_datetime.timetuple())), + TIME_TOTAL_S: self.time_total, + WORKER_PID: os.getpid(), + WORKER_HOSTNAME: platform.node(), + WORKER_NODE_IP: self.local_ip, + } + + if not self.detailed_autofilled_metrics: + auto_filled_metrics = { + k: v + for k, v in auto_filled_metrics.items() + if k not in DETAILED_AUTOFILLED_KEYS + } + + result = result.copy() + result.update(auto_filled_metrics) + return result + + def _auto_fill_checkpoint_metrics(self, result: dict) -> dict: + """Add autofilled metrics and update attributes.""" + current_datetime = datetime.now() + + auto_filled_metrics = { + TIMESTAMP: int(time.mktime(current_datetime.timetuple())) + } + result = result.copy() + result.update(auto_filled_metrics) + return result + + def _report_thread_runner_error(self, block=False): + try: + e = self.error_queue.get(block=block, timeout=_ERROR_FETCH_TIMEOUT) + raise StartTraceback from e + except queue.Empty: + pass + + def _report_training_result(self, training_result: _TrainingResult) -> None: + """Place a training result on the result queue for the main thread to process, + then block until the main thread signals that training should continue. + + NOTE: This is used internally to report results from Train to Tune + without persisting checkpoints to storage 2 times. + `report` is the public API that directly persists to storage, which + should only be called by user code. + """ + if training_result.checkpoint: + # NOTE: This populates `train.get_checkpoint` + self.loaded_checkpoint = training_result.checkpoint + + # Add result to a thread-safe queue. + self.result_queue.put(training_result, block=True) + + # Acquire lock to stop the training thread until main thread + # triggers resume. + self.continue_lock.acquire() + + # If the trial should be terminated, exit gracefully. + # NOTE: This is only really useful if `synchronous_result_reporting=True`. + # Otherwise, the lock is immediately released on reporting, and this + # check is skipped before the main thread decides to set the stop event. 
+ if self.stop_event.is_set(): + self.stop_event.clear() + sys.exit(0) + + def report(self, metrics: Dict, checkpoint: Optional[Checkpoint] = None) -> None: + # Special case: early fail for Torch tensors + if "torch" in sys.modules: + from ray.air._internal.torch_utils import contains_tensor + + if contains_tensor(metrics): + raise ValueError( + "Passing objects containg Torch tensors as metrics " + "is not supported as it will throw an exception on " + "deserialization. You can either convert the tensors " + "to Python objects or report a `train.Checkpoint` " + "with `ray.train.report` to store your Torch objects." + ) + + if self.ignore_report: + return + + metrics = self._auto_fill_metrics(metrics) + + persisted_checkpoint = None + if checkpoint: + self.storage._update_checkpoint_index(metrics) + + # Persist the reported checkpoint files to storage. + persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint) + + metrics[CHECKPOINT_DIR_NAME] = self.storage.checkpoint_dir_name + else: + metrics[CHECKPOINT_DIR_NAME] = None + + # Persist trial artifacts to storage. + force_artifact_sync = ( + persisted_checkpoint + and self.storage.sync_config.sync_artifacts_on_checkpoint + ) + self.storage.persist_artifacts(force=force_artifact_sync) + + # Set additional user metadata from the Trainer. + if persisted_checkpoint and self.metadata: + user_metadata = persisted_checkpoint.get_metadata() + for k, v in self.metadata.items(): + # Update keys not already set by the user. This gives user-set keys + # precedence over keys set at the Trainer level. + if k not in user_metadata: + user_metadata[k] = v + persisted_checkpoint.set_metadata(user_metadata) + + result = _TrainingResult(checkpoint=persisted_checkpoint, metrics=metrics) + + self._report_training_result(result) + + @property + def experiment_name(self) -> str: + return self.trial_info.experiment_name + + @property + def trial_name(self) -> str: + return self.trial_info.name + + @property + def trial_id(self) -> str: + return self.trial_info.id + + @property + def run_id(self) -> str: + return self.trial_info.run_id + + @property + def trial_resources(self) -> "PlacementGroupFactory": + return self.trial_info.resources + + @property + def trial_dir(self) -> str: + return self.trial_info.logdir + + def get_dataset_shard( + self, + dataset_name: Optional[str] = None, + ) -> Optional["DataIterator"]: + shard = self.dataset_shard + if shard is None: + warnings.warn( + "No dataset passed in. Returning None. Make sure to " + "pass in a Dataset to Trainer.run to use this " + "function." + ) + elif isinstance(shard, dict): + if not dataset_name: + raise RuntimeError( + "Multiple datasets were passed into ``Trainer``, " + "but no ``dataset_name`` is passed into " + "``get_dataset_shard``. Please specify which " + "dataset shard to retrieve." + ) + return shard.get(dataset_name) + return shard + + +# Cache of resource dicts that have been checked by the launch hook already. +_checked_resources: Set[frozenset] = set() + +# Global _TrainSession object initialized by Ray Tune function trainables +# and Ray Train V1 workers. +_session: Optional[_TrainSession] = None + + +def _tune_task_and_actor_launch_hook( + fn, resources: Dict[str, float], strategy: Optional[SchedulingStrategyT] +): + """Launch hook to catch nested tasks that can't fit in the placement group. + + This gives users a nice warning in case they launch a nested task in a Tune trial + without reserving resources in the trial placement group to fit it. 
+ """ + + # Already checked, skip for performance reasons. + key = frozenset({(k, v) for k, v in resources.items() if v > 0}) + if not key or key in _checked_resources: + return + + # No need to check if placement group is None. + if ( + not isinstance(strategy, PlacementGroupSchedulingStrategy) + or strategy.placement_group is None + ): + return + + # Check if the resource request is targeting the current placement group. + cur_pg = ray.util.get_current_placement_group() + if not cur_pg or strategy.placement_group.id != cur_pg.id: + return + + _checked_resources.add(key) + + # Check if the request can be fulfilled by the current placement group. + pgf = get_trial_resources() + + if pgf.head_bundle_is_empty: + available_bundles = cur_pg.bundle_specs[0:] + else: + available_bundles = cur_pg.bundle_specs[1:] + + # Check if the request can be fulfilled by the current placement group. + if _valid_resource_shape(resources, available_bundles): + return + + if fn.class_name: + submitted = "actor" + name = fn.module_name + "." + fn.class_name + "." + fn.function_name + else: + submitted = "task" + name = fn.module_name + "." + fn.function_name + + # Normalize the resource spec so it looks the same as the placement group bundle. + main_resources = cur_pg.bundle_specs[0] + resources = {k: float(v) for k, v in resources.items() if v > 0} + + raise RuntimeError( + f"No trial resources are available for launching the {submitted} `{name}`. " + "To resolve this, specify the Tune option:\n\n" + "> resources_per_trial=tune.PlacementGroupFactory(\n" + f"> [{main_resources}] + [{resources}] * N\n" + "> )\n\n" + f"Where `N` is the number of slots to reserve for trial {submitted}s. " + "If you are using a Ray training library, there might be a utility function " + "to set this automatically for you. For more information, refer to " + "https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html" + ) + + +def init_session(*args, **kwargs) -> None: + global _session + if _session: + raise ValueError( + "A Train session is already in use. Do not call " + "`init_session()` manually." + ) + + # Setup hooks for generating placement group resource deadlock warnings. + from ray import actor, remote_function + + if "TUNE_DISABLE_RESOURCE_CHECKS" not in os.environ: + actor._actor_launch_hook = _tune_task_and_actor_launch_hook + remote_function._task_launch_hook = _tune_task_and_actor_launch_hook + + _session = _TrainSession(*args, **kwargs) + + +def get_session() -> Optional[_TrainSession]: + return _session + + +def shutdown_session(): + """Shuts down the initialized session.""" + global _session + _session = None + + +def _raise_accelerator_session_misuse(): + """Raises a SessionMisuseError because a utility function was used improperly.""" + raise SessionMisuseError( + "prepare/accelerate utility functions should be called inside a training " + "function executed by `Trainer.run`" + ) + + +def get_accelerator(default_accelerator_cls: Type[Accelerator]) -> Accelerator: + """The accelerator for this training session. + + If an accelerator has not been set, then this method will construct an + accelerator using the provided accelerator class. + + Raises: + SessionMisuseError: if the session is uninitialized. 
+ """ + session = get_session() + if session is None: + _raise_accelerator_session_misuse() + if session.accelerator is None: + session.accelerator = default_accelerator_cls() + return session.accelerator + + +def set_accelerator(accelerator: Accelerator) -> None: + """Sets the accelerator for this training session. + + Args: + accelerator: The accelerator to use for training. + + Raises: + SessionMisuseError: if the session is unitialized. + RuntimeError: if the accelerator has already been set. + """ + session = get_session() + if session is None: + _raise_accelerator_session_misuse() + if session.accelerator is not None: + raise RuntimeError("Cannot change accelerator once set.") + session.accelerator = accelerator + + +def _warn_session_misuse(default_value: Any = None): + """Warns if fn is being used outside of session and returns ``default_value``.""" + + def inner(fn: Callable): + fn_name = fn.__name__ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + session = get_session() + if not session: + if log_once(f"{SESSION_MISUSE_LOG_ONCE_KEY}-{fn_name}"): + warnings.warn( + f"`{fn_name}` is meant to only be " + "called inside a function that is executed by a Tuner" + f" or Trainer. Returning `{default_value}`." + ) + return default_value + return fn(*args, **kwargs) + + return wrapper + + return inner + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def report(metrics: Dict, *, checkpoint: Optional[Checkpoint] = None) -> None: + """Report metrics and optionally save a checkpoint. + + If a checkpoint is provided, it will be + :ref:`persisted to storage `. + + If this is called in multiple distributed training workers: + + - Only the metrics reported by the rank 0 worker will be tracked by Ray Train. + See :ref:`the metrics logging guide `. + - A checkpoint will be registered as long as one or more workers reports + checkpoint that is not None. + See the :ref:`checkpointing guide `. + - Checkpoints from multiple workers will be merged into one directory + in persistent storage. + See :ref:`the distributed checkpointing guide `. + + .. note:: + + Each invocation of this method will automatically increment the underlying + ``training_iteration`` number. The physical meaning of this "iteration" is + defined by user depending on how often they call ``report``. + It does not necessarily map to one epoch. + + .. warning:: + + All workers must call `ray.train.report` the same number of times + so that Ray Train can properly synchronize the training state across + workers. Otherwise, your training will hang. + + .. warning:: + + This method does NOT act as a barrier for distributed training workers. + Workers will upload their checkpoint, then continue training immediately. + If you need to synchronize workers, you can use a framework-native barrier + such as `torch.distributed.barrier()`. + + Example: + + .. testcode:: + + import tempfile + + from ray import train + from ray.train import Checkpoint + from ray.train.torch import TorchTrainer + + + def train_func(config): + start_epoch = 0 + checkpoint = train.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_dir: + # Load back training state + ... + + for epoch in range(start_epoch, config.get("num_epochs", 10)): + # Do training... + + metrics = {"loss": ...} + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + # Save the checkpoint... + # torch.save(...) + + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + + # Example: Only the rank 0 worker uploads the checkpoint. 
+ if ray.train.get_context().get_world_rank() == 0: + train.report(metrics, checkpoint=checkpoint) + else: + train.report(metrics, checkpoint=None) + + trainer = TorchTrainer( + train_func, scaling_config=train.ScalingConfig(num_workers=2) + ) + + Args: + metrics: The metrics you want to report. + checkpoint: The optional checkpoint you want to report. + """ + # If we are running in a Tune function, switch to `ray.tune.report`. + from ray.tune.trainable.trainable_fn_utils import _in_tune_session + + if _in_tune_session(): + import ray.tune + + if _v2_migration_warnings_enabled(): + _log_deprecation_warning( + "`ray.train.report` should be switched to " + "`ray.tune.report` when running in a function " + "passed to Ray Tune. This will be an error in the future." + ) + return ray.tune.report(metrics, checkpoint=checkpoint) + + get_session().report(metrics, checkpoint=checkpoint) + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def get_checkpoint() -> Optional[Checkpoint]: + """Access the latest reported checkpoint to resume from if one exists. + + Example: + + .. testcode:: + + import tempfile + + from ray import train + from ray.train import Checkpoint + from ray.train.torch import TorchTrainer + + + def train_func(config): + start_epoch = 0 + checkpoint = train.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_dir: + # Load back training state + ... + + for epoch in range(start_epoch, config.get("num_epochs", 10)): + # Do training... + + metrics = {"loss": ...} + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + # Save the checkpoint... + + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + train.report(metrics, checkpoint=checkpoint) + + trainer = TorchTrainer( + train_func, scaling_config=train.ScalingConfig(num_workers=2) + ) + + Returns: + Checkpoint object if the session is currently being resumed. + Otherwise, return None. + """ + # If we are running in a Tune function, switch to `ray.tune.get_checkpoint`. + from ray.tune.trainable.trainable_fn_utils import _in_tune_session + + if _in_tune_session(): + import ray.tune + + if _v2_migration_warnings_enabled(): + _log_deprecation_warning( + "`ray.train.get_checkpoint` should be switched to " + "`ray.tune.get_checkpoint` when running in a function " + "passed to Ray Tune. This will be an error in the future." 
+ ) + return ray.tune.get_checkpoint() + + return get_session().loaded_checkpoint + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_metadata() -> Dict[str, Any]: + """User metadata dict passed to the Trainer constructor.""" + return get_session().metadata + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_experiment_name() -> str: + """Experiment name for the corresponding trial.""" + return get_session().experiment_name + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_name() -> str: + """Trial name for the corresponding trial.""" + return get_session().trial_name + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_id() -> str: + """Trial id for the corresponding trial.""" + return get_session().trial_id + + +@PublicAPI(stability="alpha") +@_warn_session_misuse() +def get_run_id() -> str: + """Unique Train Run id for the corresponding trial.""" + return get_session().run_id + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_resources() -> "PlacementGroupFactory": + """Trial resources for the corresponding trial.""" + return get_session().trial_resources + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_dir() -> str: + """Log directory corresponding to the trial directory for a Tune session. + If calling from a Train session, this will give the trial directory of its parent + Tune session. + + .. testcode:: + + from ray import train, tune + + def train_func(config): + print(train.get_context().get_trial_dir()) + + tuner = tune.Tuner(train_func) + tuner.fit() + + .. testoutput:: + :options: +MOCK + + /Users/root/ray_results/train_func_2023-07-19_15-01-37/train_func_d620c_00000_0_2023-07-19_15-01-40 + """ + return get_session().trial_dir + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=1) +def get_world_size() -> int: + """Get the current world size (i.e. total number of workers) for this run. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + NUM_WORKERS = 2 + + def train_loop_per_worker(config): + assert train.get_context().get_world_size() == NUM_WORKERS + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TensorflowTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=NUM_WORKERS), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "world_size"): + raise RuntimeError( + "`get_world_size` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.world_size + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_world_rank() -> int: + """Get the world rank of this worker. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + def train_loop_per_worker(config): + if train.get_context().get_world_rank() == 0: + print("Worker 0") + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TensorflowTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... 
+ """ + session = get_session() + if not hasattr(session, "world_rank"): + raise RuntimeError( + "`get_world_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.world_rank + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_local_rank() -> int: + """Get the local rank of this worker (rank of the worker on its node). + + .. testcode:: + + import torch + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(config): + if torch.cuda.is_available(): + torch.cuda.set_device(train.get_context().get_local_rank()) + ... + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TorchTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2, use_gpu=True), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "local_rank"): + raise RuntimeError( + "`get_local_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.local_rank + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_local_world_size() -> int: + """Get the local world size of this node (i.e. number of workers on this node). + + Example: + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(): + print(train.get_context().get_local_world_size()) + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)]) + trainer = TorchTrainer(train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=1), + datasets={"train": train_dataset}) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "local_world_size"): + raise RuntimeError( + "`get_local_world_size` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.local_world_size + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_node_rank() -> int: + """Get the rank of this node. + + Example: + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(): + print(train.get_context().get_node_rank()) + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)]) + trainer = TorchTrainer(train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=1), + datasets={"train": train_dataset}) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "node_rank"): + raise RuntimeError( + "`get_node_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.node_rank + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def get_dataset_shard( + dataset_name: Optional[str] = None, +) -> Optional["DataIterator"]: + """Returns the :class:`ray.data.DataIterator` shard for this worker. 
+ + Call :meth:`~ray.data.DataIterator.iter_torch_batches` or + :meth:`~ray.data.DataIterator.to_tf` on this shard to convert it to the + appropriate framework-specific data type. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(config): + ... + for epoch in range(2): + # Trainer will automatically handle sharding. + data_shard = train.get_dataset_shard("train") + for batch in data_shard.iter_torch_batches(): + ... + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TorchTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + + Args: + dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then + specifies which dataset shard to return. + + Returns: + The ``DataIterator`` shard to use for this worker. + If no dataset is passed into Trainer, then return None. + """ + session = get_session() + if not hasattr(session, "get_dataset_shard"): + raise RuntimeError( + "`get_dataset_shard` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.get_dataset_shard(dataset_name) + + +@DeveloperAPI +@_warn_session_misuse() +def get_storage() -> StorageContext: + """Returns the :class:`~ray.train._internal.storage.StorageContext` storage + context which gives advanced access to the filesystem and paths + configured through `RunConfig`. + + NOTE: This is a developer API, and the `StorageContext` interface may change + without notice between minor versions. + """ + return get_session().storage diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..604a4fa3932930fa1d728ceb6e9bdd4619449bf6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py @@ -0,0 +1,14 @@ +from ray.train._internal.state.state_manager import TrainRunStateManager + +try: + import pydantic # noqa: F401 +except ImportError: + raise ModuleNotFoundError( + "pydantic isn't installed." 
+ "To install pydantic, please run 'pip install pydantic'" + ) + + +__all__ = [ + "TrainRunStateManager", +] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94b4bd7b07cc941dba6049ce0133eb70db7a52a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8a30db76ecf2075376e6b55685fbc0494ede08d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..765fc521e4ef2d1dda66317e29560619ba0bb1e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b5cfda81aecb3eddd7678bceecac47adc66fac6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..48a4d7e4b2e0b2ae545c14dbe8020372c1d35f6d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py @@ -0,0 +1,158 @@ +from enum import Enum +from typing import List, Optional + +from ray._private.pydantic_compat import BaseModel, Field +from ray.dashboard.modules.job.pydantic_models import JobDetails +from ray.util.annotations import DeveloperAPI + +MAX_ERROR_STACK_TRACE_LENGTH = 50000 + + +@DeveloperAPI +class RunStatusEnum(str, Enum): + """Enumeration for the status of a train run.""" + + # (Deprecated) Replaced by RUNNING. 
+ # The train run has started + STARTED = "STARTED" + # The train run is running + RUNNING = "RUNNING" + # The train run was terminated as expected + FINISHED = "FINISHED" + # The train run was terminated early due to errors in the training function + ERRORED = "ERRORED" + # The train run was terminated early due to system errors or controller errors + ABORTED = "ABORTED" + + +@DeveloperAPI +class ActorStatusEnum(str, Enum): + DEAD = "DEAD" + ALIVE = "ALIVE" + + +@DeveloperAPI +class TrainWorkerInfo(BaseModel): + """Metadata of a Ray Train worker.""" + + actor_id: str = Field(description="Actor ID of the worker.") + world_rank: int = Field(description="World rank of the worker.") + local_rank: int = Field(description="Local rank of the worker.") + node_rank: int = Field(description="Node rank of the worker.") + node_id: str = Field(description="ID of the node that the worker is running on.") + node_ip: str = Field( + description="IP address of the node that the worker is running on." + ) + pid: int = Field(description="Process ID of the worker.") + gpu_ids: List[int] = Field( + description="A list of GPU ids allocated to that worker." + ) + status: Optional[ActorStatusEnum] = Field( + description="The status of the train worker actor. It can be ALIVE or DEAD." + ) + + +@DeveloperAPI +class MemoryInfo(BaseModel): + rss: int + vms: int + pfaults: Optional[int] + pageins: Optional[int] + + +@DeveloperAPI +class ProcessStats(BaseModel): + cpuPercent: float + # total memory, free memory, memory used ratio + mem: Optional[List[int]] + memoryInfo: MemoryInfo + + +class ProcessGPUUsage(BaseModel): + # This gpu usage stats from a process + pid: int + gpuMemoryUsage: int + + +@DeveloperAPI +class GPUStats(BaseModel): + uuid: str + index: int + name: str + utilizationGpu: Optional[float] + memoryUsed: float + memoryTotal: float + processInfo: ProcessGPUUsage + + +@DeveloperAPI +class TrainWorkerInfoWithDetails(TrainWorkerInfo): + """Metadata of a Ray Train worker.""" + + processStats: Optional[ProcessStats] = Field( + None, description="Process stats of the worker." + ) + gpus: List[GPUStats] = Field( + default_factory=list, + description=( + "GPU stats of the worker. " + "Only returns GPUs that are attached to the worker process." + ), + ) + + +@DeveloperAPI +class TrainDatasetInfo(BaseModel): + name: str = Field( + description="The key of the dataset dict specified in Ray Train Trainer." + ) + dataset_uuid: str = Field(description="The uuid of the dataset.") + dataset_name: Optional[str] = Field(description="The name of the dataset.") + + +@DeveloperAPI +class TrainRunInfo(BaseModel): + """Metadata for a Ray Train run and information about its workers.""" + + name: str = Field(description="The name of the Train run.") + id: str = Field(description="The unique identifier for each Train run.") + job_id: str = Field(description="The Ray Job ID.") + controller_actor_id: str = Field(description="Actor Id of the Train controller.") + workers: List[TrainWorkerInfo] = Field( + description="A List of Train workers sorted by global ranks." + ) + datasets: List[TrainDatasetInfo] = Field( + description="A List of dataset info for this Train run." + ) + run_status: RunStatusEnum = Field( + description="The current status of the train run. It can be one of the " + "following: RUNNING, FINISHED, ERRORED, or ABORTED." + ) + status_detail: str = Field( + description="Detailed information about the current run status, " + "such as error messages." 
+ ) + start_time_ms: int = Field( + description="The UNIX timestamp of the start time of this Train run." + ) + end_time_ms: Optional[int] = Field( + description="The UNIX timestamp of the end time of this Train run. " + "If null, the Train run has not ended yet." + ) + + +@DeveloperAPI +class TrainRunInfoWithDetails(TrainRunInfo): + """Metadata for a Ray Train run and information about its workers.""" + + workers: List[TrainWorkerInfoWithDetails] = Field( + description="A List of Train workers sorted by global ranks." + ) + job_details: Optional[JobDetails] = Field( + None, description="Details of the job that started this Train run." + ) + + +@DeveloperAPI +class TrainRunsResponse(BaseModel): + train_runs: List[TrainRunInfoWithDetails] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..6404eb231ca193eac315b801587412313575650d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py @@ -0,0 +1,62 @@ +import logging +import threading +from typing import Dict, Optional + +import ray +from ray.actor import ActorHandle +from ray.train._internal.state.schema import TrainRunInfo + +logger = logging.getLogger(__name__) + + +@ray.remote(num_cpus=0) +class TrainStateActor: + def __init__(self): + self._run_infos: Dict[str, TrainRunInfo] = {} + + def register_train_run(self, run_info: TrainRunInfo) -> None: + # Register a new train run. + self._run_infos[run_info.id] = run_info + + def get_train_run(self, run_id: str) -> Optional[TrainRunInfo]: + # Retrieve a registered run with its id + return self._run_infos.get(run_id, None) + + def get_all_train_runs(self) -> Dict[str, TrainRunInfo]: + # Retrieve all registered train runs + return self._run_infos + + +TRAIN_STATE_ACTOR_NAME = "train_state_actor" +TRAIN_STATE_ACTOR_NAMESPACE = "_train_state_actor" + +_state_actor_lock: threading.RLock = threading.RLock() + + +def get_or_create_state_actor() -> ActorHandle: + """Get or create a `TrainStateActor` on the head node.""" + with _state_actor_lock: + state_actor = TrainStateActor.options( + name=TRAIN_STATE_ACTOR_NAME, + namespace=TRAIN_STATE_ACTOR_NAMESPACE, + get_if_exists=True, + lifetime="detached", + resources={"node:__internal_head__": 0.001}, + # Escape from the parent's placement group + scheduling_strategy="DEFAULT", + ).remote() + + # Ensure the state actor is ready + ray.get(state_actor.__ray_ready__.remote()) + return state_actor + + +def get_state_actor() -> Optional[ActorHandle]: + """Get the `TrainStateActor` if exists, otherwise return None.""" + try: + return ray.get_actor( + name=TRAIN_STATE_ACTOR_NAME, + namespace=TRAIN_STATE_ACTOR_NAMESPACE, + ) + except ValueError: + return None diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e7183714fc72f7a5b864d0f50fd18b725d0c2413 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py @@ -0,0 +1,126 @@ +import logging +import os +from collections import defaultdict +from typing import Any, Dict + +import ray +from ray.data import Dataset +from ray.train._internal.state.schema import ( + RunStatusEnum, + TrainDatasetInfo, + TrainRunInfo, + TrainWorkerInfo, +) +from ray.train._internal.utils import 
check_for_failure +from ray.train._internal.worker_group import WorkerGroup + +logger = logging.getLogger(__name__) + + +class TrainRunStateManager: + """A class that aggregates and reports train run info to TrainStateActor. + + This manager class is created on the train controller layer for each run. + """ + + def __init__(self, state_actor) -> None: + self.state_actor = state_actor + self.train_run_info_dict = defaultdict(dict) + + def register_train_run( + self, + run_id: str, + job_id: str, + run_name: str, + run_status: str, + controller_actor_id: str, + datasets: Dict[str, Dataset], + worker_group: WorkerGroup, + start_time_ms: float, + status_detail: str = "", + ) -> None: + """Collect Train Run Info and report to StateActor.""" + + if not self.state_actor: + logger.warning( + "Unable to register train run since `TrainStateActor` is not started." + ) + return + + def collect_train_worker_info(): + train_context = ray.train.get_context() + core_context = ray.runtime_context.get_runtime_context() + + return TrainWorkerInfo( + world_rank=train_context.get_world_rank(), + local_rank=train_context.get_local_rank(), + node_rank=train_context.get_node_rank(), + actor_id=core_context.get_actor_id(), + node_id=core_context.get_node_id(), + node_ip=ray.util.get_node_ip_address(), + gpu_ids=ray.get_gpu_ids(), + pid=os.getpid(), + ) + + futures = [ + worker_group.execute_single_async(index, collect_train_worker_info) + for index in range(len(worker_group)) + ] + success, exception = check_for_failure(futures) + + if not success: + logger.error( + "Failed to collect run information from the Ray Train " + f"workers:\n{exception}" + ) + return + + worker_info_list = ray.get(futures) + worker_info_list = sorted(worker_info_list, key=lambda info: info.world_rank) + + dataset_info_list = [ + TrainDatasetInfo( + name=ds_name, + dataset_name=ds._plan._dataset_name, + dataset_uuid=ds._plan._dataset_uuid, + ) + for ds_name, ds in datasets.items() + ] + + updates = dict( + id=run_id, + job_id=job_id, + name=run_name, + controller_actor_id=controller_actor_id, + workers=worker_info_list, + datasets=dataset_info_list, + start_time_ms=start_time_ms, + run_status=run_status, + status_detail=status_detail, + ) + + # Clear the cached info to avoid registering the same run twice + self.train_run_info_dict[run_id] = {} + self._update_train_run_info(run_id, updates) + + def end_train_run( + self, + run_id: str, + run_status: RunStatusEnum, + status_detail: str, + end_time_ms: int, + ): + """Update the train run status when the training is finished.""" + updates = dict( + run_status=run_status, + status_detail=status_detail, + end_time_ms=end_time_ms, + ) + self._update_train_run_info(run_id, updates) + + def _update_train_run_info(self, run_id: str, updates: Dict[str, Any]) -> None: + """Update specific fields of a registered TrainRunInfo instance.""" + if run_id in self.train_run_info_dict: + self.train_run_info_dict[run_id].update(updates) + train_run_info = TrainRunInfo(**self.train_run_info_dict[run_id]) + ray.get(self.state_actor.register_train_run.remote(train_run_info)) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..05970988862e371b2f080fb92e245a116a4596bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py @@ -0,0 +1,725 @@ +# Try import ray[train] core requirements (defined in setup.py) +# isort: off +try: 
+ import fsspec # noqa + from fsspec.implementations.local import LocalFileSystem + +except (ImportError, ModuleNotFoundError) as e: + raise RuntimeError( + "fsspec is a required dependency of Ray Train and Ray Tune. " + "Please install with: `pip install fsspec`" + ) from e + +try: + import pyarrow + import pyarrow.fs + +except (ImportError, ModuleNotFoundError) as e: + raise RuntimeError( + "pyarrow is a required dependency of Ray Train and Ray Tune. " + "Please install with: `pip install pyarrow`" + ) from e + +try: + # check if Arrow has S3 support + from pyarrow.fs import S3FileSystem +except ImportError: + S3FileSystem = None +# isort: on + +import fnmatch +import logging +import os +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union + +from ray.air._internal.filelock import TempFileLock +from ray.train._internal.syncer import SyncConfig, Syncer, _BackgroundSyncer +from ray.train.constants import _get_ray_train_session_dir +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.train._checkpoint import Checkpoint + + +logger = logging.getLogger(__name__) + + +_VALIDATE_STORAGE_MARKER_FILENAME = ".validate_storage_marker" + + +class _ExcludingLocalFilesystem(LocalFileSystem): + """LocalFileSystem wrapper to exclude files according to patterns. + + Args: + root_path: Root path to strip when matching with the exclude pattern. + Ex: root_path="/tmp/a/b/c", exclude=["*a*"], will exclude + /tmp/a/b/c/_a_.txt but not ALL of /tmp/a/*. + exclude: List of patterns that are applied to files returned by + ``self.find()``. If a file path matches this pattern, it will + be excluded. + + """ + + def __init__(self, root_path: Path, exclude: List[str], **kwargs): + super().__init__(**kwargs) + self._exclude = exclude + self._root_path = root_path + + @property + def fsid(self): + return "_excluding_local" + + def _should_exclude(self, path: str) -> bool: + """Return True if `path` (relative to `root_path`) matches any of the + `self._exclude` patterns.""" + path = Path(path) + relative_path = path.relative_to(self._root_path).as_posix() + match_candidates = [relative_path] + if path.is_dir(): + # Everything is in posix path format ('/') + match_candidates.append(relative_path + "/") + + for excl in self._exclude: + if any(fnmatch.fnmatch(candidate, excl) for candidate in match_candidates): + return True + return False + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + """Call parent find() and exclude from result.""" + paths = super().find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs + ) + if detail: + return { + path: out + for path, out in paths.items() + if not self._should_exclude(path) + } + else: + return [path for path in paths if not self._should_exclude(path)] + + +def _pyarrow_fs_copy_files( + source, destination, source_filesystem=None, destination_filesystem=None, **kwargs +): + if S3FileSystem and isinstance(destination_filesystem, pyarrow.fs.S3FileSystem): + # Workaround multi-threading issue with pyarrow. Note that use_threads=True + # is safe for download, just not for uploads, see: + # https://github.com/apache/arrow/issues/32372 + kwargs.setdefault("use_threads", False) + + # Use a large chunk size to speed up large checkpoint transfers. 
+ kwargs.setdefault("chunk_size", 64 * 1024 * 1024) + + return pyarrow.fs.copy_files( + source, + destination, + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + **kwargs, + ) + + +# TODO(justinvyu): Add unit tests for all these utils. + + +def _delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str): + is_dir = _is_directory(fs, fs_path) + + try: + if is_dir: + fs.delete_dir(fs_path) + else: + fs.delete_file(fs_path) + except Exception: + logger.exception(f"Caught exception when deleting path at ({fs}, {fs_path}):") + + +def _download_from_fs_path( + fs: pyarrow.fs.FileSystem, + fs_path: str, + local_path: str, + filelock: bool = True, +): + """Downloads a directory or file from (fs, fs_path) to a local path. + + If fs_path points to a directory: + - The full directory contents are downloaded directly into `local_path`, + rather than to a subdirectory of `local_path`. + + If fs_path points to a file: + - The file is downloaded to `local_path`, which is expected to be a file path. + + If the download fails, the `local_path` contents are + cleaned up before raising, if the directory did not previously exist. + + NOTE: This method creates `local_path`'s parent directories if they do not + already exist. If the download fails, this does NOT clean up all the parent + directories that were created. + + Args: + fs: The filesystem to download from. + fs_path: The filesystem path (either a directory or a file) to download. + local_path: The local path to download to. + filelock: Whether to require a file lock before downloading, useful for + multiple downloads to the same directory that may be happening in parallel. + + Raises: + FileNotFoundError: if (fs, fs_path) doesn't exist. + """ + + _local_path = Path(local_path).resolve() + exists_before = _local_path.exists() + if _is_directory(fs=fs, fs_path=fs_path): + _local_path.mkdir(parents=True, exist_ok=True) + else: + _local_path.parent.mkdir(parents=True, exist_ok=True) + + try: + if filelock: + with TempFileLock(f"{os.path.normpath(local_path)}.lock"): + _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs) + else: + _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs) + except Exception as e: + # Clean up the directory if downloading was unsuccessful + if not exists_before: + shutil.rmtree(local_path, ignore_errors=True) + raise e + + +def _upload_to_fs_path( + local_path: str, + fs: pyarrow.fs.FileSystem, + fs_path: str, + exclude: Optional[List[str]] = None, +) -> None: + """Uploads a local directory or file to (fs, fs_path). + + NOTE: This will create all necessary parent directories at the destination. + + Args: + local_path: The local path to upload. + fs: The filesystem to upload to. + fs_path: The filesystem path where the dir/file will be uploaded to. + exclude: A list of filename matches to exclude from upload. This includes + all files under subdirectories as well. + This pattern will match with the relative paths of all files under + `local_path`. + Ex: ["*.png"] to exclude all .png images. 
+ """ + + if not exclude: + # TODO(justinvyu): uploading a single file doesn't work + # (since we always create a directory at fs_path) + _create_directory(fs=fs, fs_path=fs_path) + _pyarrow_fs_copy_files(local_path, fs_path, destination_filesystem=fs) + return + + _upload_to_uri_with_exclude_fsspec( + local_path=local_path, fs=fs, fs_path=fs_path, exclude=exclude + ) + + +def _upload_to_uri_with_exclude_fsspec( + local_path: str, fs: "pyarrow.fs", fs_path: str, exclude: Optional[List[str]] +) -> None: + local_fs = _ExcludingLocalFilesystem(root_path=local_path, exclude=exclude) + handler = pyarrow.fs.FSSpecHandler(local_fs) + source_fs = pyarrow.fs.PyFileSystem(handler) + + _create_directory(fs=fs, fs_path=fs_path) + _pyarrow_fs_copy_files( + local_path, fs_path, source_filesystem=source_fs, destination_filesystem=fs + ) + + +def _list_at_fs_path( + fs: pyarrow.fs.FileSystem, + fs_path: str, + file_filter: Optional[Callable[[pyarrow.fs.FileInfo], bool]] = None, +) -> List[str]: + """Returns the list of filenames at (fs, fs_path), similar to os.listdir. + + If the path doesn't exist, returns an empty list. + """ + if file_filter is None: + file_filter = lambda x: True # noqa: E731 + + selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False) + return [ + os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/")) + for file_info in fs.get_file_info(selector) + if file_filter(file_info) + ] + + +def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool: + """Returns True if (fs, fs_path) exists.""" + + valid = fs.get_file_info(fs_path) + return valid.type != pyarrow.fs.FileType.NotFound + + +def _is_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool: + """Checks if (fs, fs_path) is a directory or a file. + + Raises: + FileNotFoundError: if (fs, fs_path) doesn't exist. + """ + + file_info = fs.get_file_info(fs_path) + if file_info.type == pyarrow.fs.FileType.NotFound: + raise FileNotFoundError(f"Path not found: ({fs}, {fs_path})") + + return not file_info.is_file + + +def _create_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> None: + """Create directory at (fs, fs_path). + + Some external filesystems require directories to already exist, or at least + the `netloc` to be created (e.g. PyArrows ``mock://`` filesystem). + + Generally this should be done before and outside of Ray applications. This + utility is thus primarily used in testing, e.g. of ``mock://` URIs. + """ + try: + fs.create_dir(fs_path) + except Exception: + logger.exception( + f"Caught exception when creating directory at ({fs}, {fs_path}):" + ) + + +def get_fs_and_path( + storage_path: Union[str, os.PathLike], + storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, +) -> Tuple[pyarrow.fs.FileSystem, str]: + """Returns the fs and path from a storage path and an optional custom fs. + + Args: + storage_path: A storage path or URI. (ex: s3://bucket/path or /tmp/ray_results) + storage_filesystem: A custom filesystem to use. If not provided, + this will be auto-resolved by pyarrow. If provided, the storage_path + is assumed to be prefix-stripped already, and must be a valid path + on the filesystem. 
+ """ + storage_path = str(storage_path) + + if storage_filesystem: + return storage_filesystem, storage_path + + return pyarrow.fs.FileSystem.from_uri(storage_path) + + +class _FilesystemSyncer(_BackgroundSyncer): + """Syncer between local filesystem and a `storage_filesystem`.""" + + def __init__(self, storage_filesystem: Optional["pyarrow.fs.FileSystem"], **kwargs): + self.storage_filesystem = storage_filesystem + super().__init__(**kwargs) + + def _sync_up_command( + self, local_path: str, uri: str, exclude: Optional[List] = None + ) -> Tuple[Callable, Dict]: + # TODO(justinvyu): Defer this cleanup up as part of the + # external-facing Syncer deprecation. + fs_path = uri + return ( + _upload_to_fs_path, + dict( + local_path=local_path, + fs=self.storage_filesystem, + fs_path=fs_path, + exclude=exclude, + ), + ) + + def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]: + fs_path = uri + return ( + _download_from_fs_path, + dict( + fs=self.storage_filesystem, + fs_path=fs_path, + local_path=local_path, + ), + ) + + def _delete_command(self, uri: str) -> Tuple[Callable, Dict]: + fs_path = uri + return _delete_fs_path, dict(fs=self.storage_filesystem, fs_path=fs_path) + + +@DeveloperAPI +class StorageContext: + """Shared context that holds the source of truth for all paths and + storage utilities, passed along from the driver to workers. + + This object defines a few types of paths: + 1. *_fs_path: A path on the `storage_filesystem`. This is a regular path + which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and + can be joined with `Path(...).as_posix()`. + 2. *_driver_staging_path: The temporary staging directory on the local filesystem + where driver artifacts are saved to before persisting them to storage. + 3. trial_working_directory: The local filesystem path that the remote + actors' working directories are moved to by default. + This is separated from the driver staging path so that driver syncing + does not implicitly upload the trial working directory, for trials on the + driver node. + + Example with storage_path="mock:///bucket/path?param=1": + + >>> import ray + >>> from ray.train._internal.storage import StorageContext + >>> import os + >>> _ = ray.init() + >>> storage = StorageContext( + ... storage_path="mock://netloc/bucket/path?param=1", + ... experiment_dir_name="exp_name", + ... ) + >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS + >> storage.experiment_fs_path + 'bucket/path/exp_name' + >>> storage.experiment_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts' + >>> storage.trial_dir_name = "trial_dir" + >>> storage.trial_fs_path + 'bucket/path/exp_name/trial_dir' + >>> storage.trial_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts/trial_dir' + >>> storage.trial_working_directory # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/working_dirs/trial_dir' + >>> storage.current_checkpoint_index = 1 + >>> storage.checkpoint_fs_path + 'bucket/path/exp_name/trial_dir/checkpoint_000001' + >>> ray.shutdown() + + Example with storage_path="/tmp/ray_results": + + >>> from ray.train._internal.storage import StorageContext + >>> storage = StorageContext( + ... storage_path="/tmp/ray_results", + ... experiment_dir_name="exp_name", + ... 
) + >>> storage.storage_fs_path + '/tmp/ray_results' + >>> storage.experiment_fs_path + '/tmp/ray_results/exp_name' + >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS + " + ) + + def _create_validation_file(self): + """On the creation of a storage context, create a validation file at the + storage path to verify that the storage path can be written to. + This validation file is also used to check whether the storage path is + accessible by all nodes in the cluster.""" + valid_file = Path( + self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME + ).as_posix() + self.storage_filesystem.create_dir(self.experiment_fs_path) + with self.storage_filesystem.open_output_stream(valid_file): + pass + + def _check_validation_file(self): + """Checks that the validation file exists at the storage path.""" + valid_file = Path( + self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME + ).as_posix() + if not _exists_at_fs_path(fs=self.storage_filesystem, fs_path=valid_file): + raise RuntimeError( + f"Unable to set up cluster storage with the following settings:\n{self}" + "\nCheck that all nodes in the cluster have read/write access " + "to the configured storage path. `RunConfig(storage_path)` should be " + "set to a cloud storage URI or a shared filesystem path accessible " + "by all nodes in your cluster ('s3://bucket' or '/mnt/nfs'). " + "A local path on the head node is not accessible by worker nodes. " + "See: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html" # noqa: E501 + ) + + def _update_checkpoint_index(self, metrics: Dict): + # Per default, increase by 1. This can be overwritten to customize checkpoint + # directories. + self.current_checkpoint_index += 1 + + def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint": + """Persists a given checkpoint to the current checkpoint path on the filesystem. + + "Current" is defined by the `current_checkpoint_index` attribute of the + storage context. + + This method copies the checkpoint files to the storage location. + It's up to the user to delete the original checkpoint files if desired. + + For example, the original directory is typically a local temp directory. + + Args: + checkpoint: The checkpoint to persist to (fs, checkpoint_fs_path). + + Returns: + Checkpoint: A Checkpoint pointing to the persisted checkpoint location. + """ + # TODO(justinvyu): Fix this cyclical import. + from ray.train._checkpoint import Checkpoint + + logger.debug( + "Copying checkpoint files to storage path:\n" + "({source_fs}, {source}) -> ({dest_fs}, {destination})".format( + source=checkpoint.path, + destination=self.checkpoint_fs_path, + source_fs=checkpoint.filesystem, + dest_fs=self.storage_filesystem, + ) + ) + + # Raise an error if the storage path is not accessible when + # attempting to upload a checkpoint from a remote worker. + # Ex: If storage_path is a local path, then a validation marker + # will only exist on the head node but not the worker nodes. 
+ self._check_validation_file() + + self.storage_filesystem.create_dir(self.checkpoint_fs_path) + _pyarrow_fs_copy_files( + source=checkpoint.path, + destination=self.checkpoint_fs_path, + source_filesystem=checkpoint.filesystem, + destination_filesystem=self.storage_filesystem, + ) + + persisted_checkpoint = Checkpoint( + filesystem=self.storage_filesystem, + path=self.checkpoint_fs_path, + ) + logger.info(f"Checkpoint successfully created at: {persisted_checkpoint}") + return persisted_checkpoint + + def persist_artifacts(self, force: bool = False) -> None: + """Persists all artifacts within `trial_local_dir` to storage. + + This method possibly launches a background task to sync the trial dir, + depending on the `sync_period` + `sync_artifacts_on_checkpoint` + settings of `SyncConfig`. + + `(local_fs, trial_working_dir) -> (storage_filesystem, trial_fs_path)` + + Args: + force: If True, wait for a previous sync to finish, launch a new one, + and wait for that one to finish. By the end of a `force=True` call, the + latest version of the trial artifacts will be persisted. + """ + if not self.sync_config.sync_artifacts: + return + + # Skip if there are no artifacts to sync + is_empty = not any(os.scandir(self.trial_working_directory)) + if is_empty: + return + + if force: + self.syncer.wait() + self.syncer.sync_up( + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path + ) + self.syncer.wait() + else: + self.syncer.sync_up_if_needed( + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path + ) + + @property + def experiment_fs_path(self) -> str: + """The path on the `storage_filesystem` to the experiment directory. + + NOTE: This does not have a URI prefix anymore, since it has been stripped + by pyarrow.fs.FileSystem.from_uri already. The URI scheme information is + kept in `storage_filesystem` instead. + """ + return Path(self.storage_fs_path, self.experiment_dir_name).as_posix() + + def _get_session_path(self) -> str: + """The Ray Train/Tune session local directory used to stage files + before persisting to the storage filesystem.""" + return Path( + _get_ray_train_session_dir(), self._timestamp, self.experiment_dir_name + ).as_posix() + + @property + def experiment_driver_staging_path(self) -> str: + """The local filesystem path of the experiment directory on the driver node. + + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts` + + This should be used as the temporary staging location for files *on the driver* + before syncing them to `experiment_fs_path`. + For example, the search algorithm should dump its state to this directory. + See `trial_driver_staging_path` for writing trial-specific artifacts. + + The directory is synced to + `{storage_path}/{experiment_dir_name}` periodically. + See `_ExperimentCheckpointManager.checkpoint` for where that happens. + """ + return Path(self._get_session_path(), "driver_artifacts").as_posix() + + @property + def trial_fs_path(self) -> str: + """The trial directory path on the `storage_filesystem`. + + Raises a ValueError if `trial_dir_name` is not set beforehand. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Should not access `trial_fs_path` without setting `trial_dir_name`" + ) + return Path(self.experiment_fs_path, self.trial_dir_name).as_posix() + + @property + def trial_driver_staging_path(self) -> str: + """The local filesystem path of the trial directory on the driver. 
+ + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts/` + + This should be used as the temporary location for files on the driver + before persisting them to `trial_fs_path`. + + For example, callbacks (e.g., JsonLoggerCallback) should write trial-specific + logfiles within this directory. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Should not access `trial_driver_staging_path` " + "without setting `trial_dir_name`" + ) + return Path(self.experiment_driver_staging_path, self.trial_dir_name).as_posix() + + @property + def trial_working_directory(self) -> str: + """The local filesystem path to trial working directory. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /working_dirs/` + + Ray Train/Tune moves the remote actor's working directory to this path + by default, unless disabled by `RAY_CHDIR_TO_TRIAL_DIR` environment variable. + + Writing files to this directory allows users to persist training artifacts + if `SyncConfig(sync_artifacts=True)` is set. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Cannot access `trial_working_directory` without " + "setting `trial_dir_name`" + ) + return Path( + self._get_session_path(), "working_dirs", self.trial_dir_name + ).as_posix() + + @property + def checkpoint_fs_path(self) -> str: + """The current checkpoint directory path on the `storage_filesystem`. + + "Current" refers to the checkpoint that is currently being created/persisted. + The user of this class is responsible for setting the `current_checkpoint_index` + (e.g., incrementing when needed). + """ + return Path(self.trial_fs_path, self.checkpoint_dir_name).as_posix() + + @property + def checkpoint_dir_name(self) -> str: + """The current checkpoint directory name, based on the checkpoint index.""" + return StorageContext._make_checkpoint_dir_name(self.current_checkpoint_index) + + @staticmethod + def get_experiment_dir_name(run_obj: Union[str, Callable, Type]) -> str: + from ray.tune.experiment import Experiment + from ray.tune.utils import date_str + + run_identifier = Experiment.get_trainable_name(run_obj) + + if bool(int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0))): + dir_name = run_identifier + else: + dir_name = "{}_{}".format(run_identifier, date_str()) + return dir_name + + @staticmethod + def _make_checkpoint_dir_name(index: int): + """Get the name of the checkpoint directory, given an index.""" + return f"checkpoint_{index:06d}" diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py new file mode 100644 index 0000000000000000000000000000000000000000..4413e92452950508f9cbedc1816defd3ae7a97a3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py @@ -0,0 +1,490 @@ +import abc +import logging +import threading +import time +import traceback +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray.train.constants import _DEPRECATED_VALUE +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.widgets import Template + +logger = logging.getLogger(__name__) + +# Syncing period for syncing checkpoints between nodes or to cloud. 
+DEFAULT_SYNC_PERIOD = 300 + +# Default sync timeout after which syncing processes are aborted +DEFAULT_SYNC_TIMEOUT = 1800 + + +@PublicAPI(stability="stable") +@dataclass +class SyncConfig: + """Configuration object for Train/Tune file syncing to `RunConfig(storage_path)`. + + In Ray Train/Tune, here is where syncing (mainly uploading) happens: + + The experiment driver (on the head node) syncs the experiment directory to storage + (which includes experiment state such as searcher state, the list of trials + and their statuses, and trial metadata). + + It's also possible to sync artifacts from the trial directory to storage + by setting `sync_artifacts=True`. + For a Ray Tune run with many trials, each trial will upload its trial directory + to storage, which includes arbitrary files that you dumped during the run. + For a Ray Train run doing distributed training, each remote worker will similarly + upload its trial directory to storage. + + See :ref:`persistent-storage-guide` for more details and examples. + + Args: + sync_period: Minimum time in seconds to wait between two sync operations. + A smaller ``sync_period`` will have the data in storage updated more often + but introduces more syncing overhead. Defaults to 5 minutes. + sync_timeout: Maximum time in seconds to wait for a sync process + to finish running. A sync operation will run for at most this long + before raising a `TimeoutError`. Defaults to 30 minutes. + sync_artifacts: [Beta] Whether or not to sync artifacts that are saved to the + trial directory (accessed via `train.get_context().get_trial_dir()`) + to the persistent storage configured via `train.RunConfig(storage_path)`. + The trial or remote worker will try to launch an artifact syncing + operation every time `train.report` happens, subject to `sync_period` + and `sync_artifacts_on_checkpoint`. + Defaults to False -- no artifacts are persisted by default. + sync_artifacts_on_checkpoint: If True, trial/worker artifacts are + forcefully synced on every reported checkpoint. + This only has an effect if `sync_artifacts` is True. + Defaults to True. + """ + + sync_period: int = DEFAULT_SYNC_PERIOD + sync_timeout: int = DEFAULT_SYNC_TIMEOUT + sync_artifacts: bool = False + sync_artifacts_on_checkpoint: bool = True + upload_dir: Optional[str] = _DEPRECATED_VALUE + syncer: Optional[Union[str, "Syncer"]] = _DEPRECATED_VALUE + sync_on_checkpoint: bool = _DEPRECATED_VALUE + + # TODO(justinvyu): [Deprecated] Remove in 2.11. + def _deprecation_warning(self, attr_name: str, extra_msg: str): + if getattr(self, attr_name) != _DEPRECATED_VALUE: + raise DeprecationWarning( + f"`SyncConfig({attr_name})` is a deprecated configuration " + "Please remove it from your `SyncConfig`. " + f"{extra_msg}" + ) + + def __post_init__(self): + for attr_name, extra_msg in [ + ( + "upload_dir", + "\nPlease specify `ray.train.RunConfig(storage_path)` instead.", + ), + ( + "syncer", + "\nPlease implement custom syncing logic with a custom " + "`pyarrow.fs.FileSystem` instead, and pass it into " + "`ray.train.RunConfig(storage_filesystem)`. 
" + "See here: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html#custom-storage", # noqa: E501 + ), + ("sync_on_checkpoint", ""), + ]: + self._deprecation_warning(attr_name, extra_msg) + + def _repr_html_(self) -> str: + """Generate an HTML representation of the SyncConfig.""" + return Template("scrollableTable.html.j2").render( + table=tabulate( + { + "Setting": ["Sync period", "Sync timeout"], + "Value": [self.sync_period, self.sync_timeout], + }, + tablefmt="html", + showindex=False, + headers="keys", + ), + max_height="none", + ) + + +class _BackgroundProcess: + def __init__(self, fn: Callable): + self._fn = fn + self._process = None + self._result = {} + self._start_time = float("-inf") + + @property + def is_running(self): + return self._process and self._process.is_alive() + + @property + def start_time(self): + return self._start_time + + def start(self, *args, **kwargs): + if self.is_running: + return False + + self._result = {} + + def entrypoint(): + try: + result = self._fn(*args, **kwargs) + except Exception as e: + self._result["exception"] = e + return + + self._result["result"] = result + + self._process = threading.Thread(target=entrypoint) + self._process.daemon = True + self._process.start() + self._start_time = time.time() + + def wait(self, timeout: Optional[float] = None) -> Any: + """Waits for the background process to finish running. Waits until the + background process has run for at least `timeout` seconds, counting from + the time when the process was started.""" + if not self._process: + return None + + time_remaining = None + if timeout: + elapsed = time.time() - self.start_time + time_remaining = max(timeout - elapsed, 0) + + self._process.join(timeout=time_remaining) + + if self._process.is_alive(): + self._process = None + raise TimeoutError( + f"{getattr(self._fn, '__name__', str(self._fn))} did not finish " + f"running within the timeout of {timeout} seconds." + ) + + self._process = None + + exception = self._result.get("exception") + if exception: + raise exception + + result = self._result.get("result") + + self._result = {} + return result + + +@DeveloperAPI +class Syncer(abc.ABC): + """Syncer class for synchronizing data between Ray nodes and remote (cloud) storage. + + This class handles data transfer for two cases: + + 1. Synchronizing data such as experiment state snapshots from the driver to + cloud storage. + 2. Synchronizing data such as trial checkpoints from remote trainables to + cloud storage. + + Synchronizing tasks are usually asynchronous and can be awaited using ``wait()``. + The base class implements a ``wait_or_retry()`` API that will retry a failed + sync command. + + The base class also exposes an API to only kick off syncs every ``sync_period`` + seconds. + + Args: + sync_period: The minimum time in seconds between sync operations, as + used by ``sync_up/down_if_needed``. + sync_timeout: The maximum time to wait for a sync process to finish before + issuing a new sync operation. Ex: should be used by ``wait`` if launching + asynchronous sync tasks. + """ + + def __init__( + self, + sync_period: float = DEFAULT_SYNC_PERIOD, + sync_timeout: float = DEFAULT_SYNC_TIMEOUT, + ): + self.sync_period = sync_period + self.sync_timeout = sync_timeout + self.last_sync_up_time = float("-inf") + self.last_sync_down_time = float("-inf") + + @abc.abstractmethod + def sync_up( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + """Synchronize local directory to remote directory. 
+ + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + local_dir: Local directory to sync from. + remote_dir: Remote directory to sync up to. This is an URI + (``protocol://remote/path``). + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + @abc.abstractmethod + def sync_down( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ) -> bool: + """Synchronize remote directory to local directory. + + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + remote_dir: Remote directory to sync down from. This is an URI + (``protocol://remote/path``). + local_dir: Local directory to sync to. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + @abc.abstractmethod + def delete(self, remote_dir: str) -> bool: + """Delete directory on remote storage. + + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + remote_dir: Remote directory to delete. This is an URI + (``protocol://remote/path``). + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + def retry(self): + """Retry the last sync up, sync down, or delete command. + + You should implement this method if you spawn asynchronous syncing + processes. + """ + pass + + def wait(self, timeout: Optional[float] = None): + """Wait for asynchronous sync command to finish. + + You should implement this method if you spawn asynchronous syncing + processes. This method should timeout after the asynchronous command + has run for `sync_timeout` seconds and raise a `TimeoutError`. + """ + pass + + def sync_up_if_needed( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + """Syncs up if time since last sync up is greater than sync_period. + + Args: + local_dir: Local directory to sync from. + remote_dir: Remote directory to sync up to. This is an URI + (``protocol://remote/path``). + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + """ + now = time.time() + if now - self.last_sync_up_time >= self.sync_period: + result = self.sync_up( + local_dir=local_dir, remote_dir=remote_dir, exclude=exclude + ) + self.last_sync_up_time = now + return result + + def sync_down_if_needed( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ): + """Syncs down if time since last sync down is greater than sync_period. + + Args: + remote_dir: Remote directory to sync down from. This is an URI + (``protocol://remote/path``). + local_dir: Local directory to sync to. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. 
+ """ + now = time.time() + if now - self.last_sync_down_time >= self.sync_period: + result = self.sync_down( + remote_dir=remote_dir, local_dir=local_dir, exclude=exclude + ) + self.last_sync_down_time = now + return result + + def wait_or_retry(self, max_retries: int = 2, backoff_s: int = 5): + assert max_retries > 0 + last_error_traceback = None + for i in range(max_retries + 1): + try: + self.wait() + except Exception as e: + attempts_remaining = max_retries - i + + # If we're out of retries, then save the full traceback of the last + # error and show it when raising an exception. + if attempts_remaining == 0: + last_error_traceback = traceback.format_exc() + break + + logger.error( + f"The latest sync operation failed with the following error: " + f"{repr(e)}\n" + f"Retrying {attempts_remaining} more time(s) after sleeping " + f"for {backoff_s} seconds..." + ) + time.sleep(backoff_s) + self.retry() + continue + # Succeeded! + return + raise RuntimeError( + f"Failed sync even after {max_retries} retries. " + f"The latest sync failed with the following error:\n{last_error_traceback}" + ) + + def reset(self): + self.last_sync_up_time = float("-inf") + self.last_sync_down_time = float("-inf") + + def close(self): + pass + + def _repr_html_(self) -> str: + return + + +class _BackgroundSyncer(Syncer): + """Syncer using a background process for asynchronous file transfer.""" + + def __init__( + self, + sync_period: float = DEFAULT_SYNC_PERIOD, + sync_timeout: float = DEFAULT_SYNC_TIMEOUT, + ): + super(_BackgroundSyncer, self).__init__( + sync_period=sync_period, sync_timeout=sync_timeout + ) + self._sync_process = None + self._current_cmd = None + + def _should_continue_existing_sync(self): + """Returns whether a previous sync is still running within the timeout.""" + return ( + self._sync_process + and self._sync_process.is_running + and time.time() - self._sync_process.start_time < self.sync_timeout + ) + + def _launch_sync_process(self, sync_command: Tuple[Callable, Dict]): + """Waits for the previous sync process to finish, + then launches a new process that runs the given command.""" + if self._sync_process: + try: + self.wait() + except Exception: + logger.warning( + f"Last sync command failed with the following error:\n" + f"{traceback.format_exc()}" + ) + + self._current_cmd = sync_command + self.retry() + + def sync_up( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + if self._should_continue_existing_sync(): + logger.debug( + f"Last sync still in progress, " + f"skipping sync up of {local_dir} to {remote_dir}" + ) + return False + + sync_up_cmd = self._sync_up_command( + local_path=local_dir, uri=remote_dir, exclude=exclude + ) + self._launch_sync_process(sync_up_cmd) + + return True + + def _sync_up_command( + self, local_path: str, uri: str, exclude: Optional[List] = None + ) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def sync_down( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ) -> bool: + if self._should_continue_existing_sync(): + logger.warning( + f"Last sync still in progress, " + f"skipping sync down of {remote_dir} to {local_dir}" + ) + return False + + sync_down_cmd = self._sync_down_command(uri=remote_dir, local_path=local_dir) + self._launch_sync_process(sync_down_cmd) + + return True + + def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def delete(self, remote_dir: str) -> bool: + if self._should_continue_existing_sync(): + 
logger.warning( + f"Last sync still in progress, skipping deletion of {remote_dir}" + ) + return False + + delete_cmd = self._delete_command(uri=remote_dir) + self._launch_sync_process(delete_cmd) + + return True + + def _delete_command(self, uri: str) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def wait(self, timeout: Optional[float] = None): + if self._sync_process: + try: + self._sync_process.wait(timeout=timeout or self.sync_timeout) + except Exception as e: + raise e + finally: + # Regardless of whether the sync process succeeded within the timeout, + # clear the sync process so a new one can be created. + self._sync_process = None + + def retry(self): + if not self._current_cmd: + raise RuntimeError("No sync command set, cannot retry.") + cmd, kwargs = self._current_cmd + self._sync_process = _BackgroundProcess(cmd) + self._sync_process.start(**kwargs) + + def __getstate__(self): + state = self.__dict__.copy() + state["_sync_process"] = None + return state diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9946789ef18a33d506e4be4d811a1f3dd901d8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py @@ -0,0 +1,239 @@ +import abc +import functools +import inspect +import logging +import os +from pathlib import Path +from typing import ( + Any, + Callable, + ContextManager, + Dict, + List, + Optional, + Tuple, + TypeVar, + Union, +) + +import ray +from ray.actor import ActorHandle +from ray.air._internal.util import ( + StartTraceback, + StartTracebackWithWorkerRank, + find_free_port, +) +from ray.exceptions import RayActorError +from ray.types import ObjectRef + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +def check_for_failure( + remote_values: List[ObjectRef], +) -> Tuple[bool, Optional[Exception]]: + """Check for actor failure when retrieving the remote values. + + Args: + remote_values: List of object references from Ray actor methods. + + Returns: + A tuple of (bool, Exception). The bool is + True if evaluating all object references is successful, False otherwise. + """ + unfinished = remote_values.copy() + + while len(unfinished) > 0: + finished, unfinished = ray.wait(unfinished) + + # If a failure occurs the ObjectRef will be marked as finished. + # Calling ray.get will expose the failure as a RayActorError. + for object_ref in finished: + # Everything in finished has either failed or completed + # successfully. + try: + ray.get(object_ref) + except RayActorError as exc: + failed_actor_rank = remote_values.index(object_ref) + logger.info(f"Worker {failed_actor_rank} has failed.") + return False, exc + except Exception as exc: + # Other (e.g. training) errors should be directly raised + failed_worker_rank = remote_values.index(object_ref) + raise StartTracebackWithWorkerRank( + worker_rank=failed_worker_rank + ) from exc + + return True, None + + +def get_address_and_port() -> Tuple[str, int]: + """Returns the IP address and a free port on this node.""" + addr = ray.util.get_node_ip_address() + port = find_free_port() + + return addr, port + + +def construct_path(path: Path, parent_path: Path) -> Path: + """Constructs a path relative to a parent. + + Args: + path: A relative or absolute path. + parent_path: A relative path or absolute path. + + Returns: An absolute path. 
+ """ + if path.expanduser().is_absolute(): + return path.expanduser().resolve() + else: + return parent_path.joinpath(path).expanduser().resolve() + + +def update_env_vars(env_vars: Dict[str, Any]): + """Updates the environment variables on this worker process. + + Args: + env_vars: Environment variables to set. + """ + sanitized = {k: str(v) for k, v in env_vars.items()} + os.environ.update(sanitized) + + +def count_required_parameters(fn: Callable) -> int: + """Counts the number of required parameters of a function. + + NOTE: *args counts as 1 required parameter. + + Examples + -------- + + >>> def fn(a, b, /, c, *args, d=1, e=2, **kwargs): + ... pass + >>> count_required_parameters(fn) + 4 + + >>> fn = lambda: 1 + >>> count_required_parameters(fn) + 0 + + >>> def fn(config, a, b=1, c=2): + ... pass + >>> from functools import partial + >>> count_required_parameters(partial(fn, a=0)) + 1 + """ + params = inspect.signature(fn).parameters.values() + + positional_param_kinds = { + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.VAR_POSITIONAL, + } + return len( + [ + p + for p in params + if p.default == inspect.Parameter.empty and p.kind in positional_param_kinds + ] + ) + + +def construct_train_func( + train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]], + config: Optional[Dict[str, Any]], + train_func_context: ContextManager, + fn_arg_name: Optional[str] = "train_func", + discard_returns: bool = False, +) -> Callable[[], T]: + """Validates and constructs the training function to execute. + Args: + train_func: The training function to execute. + This can either take in no arguments or a ``config`` dict. + config (Optional[Dict]): Configurations to pass into + ``train_func``. If None then an empty Dict will be created. + train_func_context: Context manager for user's `train_func`, which executes + backend-specific logic before and after the training function. + fn_arg_name (Optional[str]): The name of training function to use for error + messages. + discard_returns: Whether to discard any returns from train_func or not. + Returns: + A valid training function. + Raises: + ValueError: if the input ``train_func`` is invalid. + """ + num_required_params = count_required_parameters(train_func) + + if discard_returns: + # Discard any returns from the function so that + # BackendExecutor doesn't try to deserialize them. + # Those returns are inaccesible with AIR anyway. + @functools.wraps(train_func) + def discard_return_wrapper(*args, **kwargs): + try: + train_func(*args, **kwargs) + except Exception as e: + raise StartTraceback from e + + wrapped_train_func = discard_return_wrapper + else: + wrapped_train_func = train_func + + if num_required_params > 1: + err_msg = ( + f"{fn_arg_name} should take in 0 or 1 required arguments, but it accepts " + f"{num_required_params} required arguments instead." 
+ ) + raise ValueError(err_msg) + elif num_required_params == 1: + config = {} if config is None else config + + @functools.wraps(wrapped_train_func) + def train_fn(): + try: + with train_func_context(): + return wrapped_train_func(config) + except Exception as e: + raise StartTraceback from e + + else: # num_params == 0 + + @functools.wraps(wrapped_train_func) + def train_fn(): + try: + with train_func_context(): + return wrapped_train_func() + except Exception as e: + raise StartTraceback from e + + return train_fn + + +class Singleton(abc.ABCMeta): + """Singleton Abstract Base Class + + https://stackoverflow.com/questions/33364070/implementing + -singleton-as-metaclass-but-for-abstract-classes + """ + + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class ActorWrapper: + """Wraps an actor to provide same API as using the base class directly.""" + + def __init__(self, actor: ActorHandle): + self.actor = actor + + def __getattr__(self, item): + # The below will fail if trying to access an attribute (not a method) from the + # actor. + actor_method = getattr(self.actor, item) + return lambda *args, **kwargs: ray.get(actor_method.remote(*args, **kwargs)) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py new file mode 100644 index 0000000000000000000000000000000000000000..72da84e3c1058796af6373945d21c4a2c00fe7b4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py @@ -0,0 +1,426 @@ +import logging +import os +import socket +from collections import defaultdict +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union + +import ray +from ray.actor import ActorHandle +from ray.air._internal.util import exception_cause, skip_exceptions +from ray.types import ObjectRef +from ray.util.placement_group import PlacementGroup + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +class RayTrainWorker: + """A class to execute arbitrary functions. Does not hold any state.""" + + def __execute(self, func: Callable[..., T], *args, **kwargs) -> T: + """Executes the input function and returns the output. + + Args: + func: The function to execute. + args, kwargs: The arguments to pass into func. + """ + try: + return func(*args, **kwargs) + except Exception as e: + skipped = skip_exceptions(e) + raise skipped from exception_cause(skipped) + + +@dataclass +class WorkerMetadata: + """Metadata for each worker/actor. + + This information is expected to stay the same throughout the lifetime of + actor. + + Args: + node_id: ID of the node this worker is on. + node_ip: IP address of the node this worker is on. + hostname: Hostname that this worker is on. + resource_ids: Map of accelerator resources + ("GPU", "neuron_cores", ..) to their IDs. + pid: Process ID of this worker. 
+ """ + + node_id: str + node_ip: str + hostname: str + resource_ids: Dict[str, List[str]] + pid: int + + +@dataclass +class Worker: + """Class representing a Worker.""" + + actor: ActorHandle + metadata: WorkerMetadata + + +def create_executable_class(executable_cls: Optional[Type] = None) -> Type: + """Create the executable class to use as the Ray actors.""" + if not executable_cls: + return RayTrainWorker + elif issubclass(executable_cls, RayTrainWorker): + return executable_cls + else: + + class _WrappedExecutable(executable_cls, RayTrainWorker): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + return _WrappedExecutable + + +def construct_metadata() -> WorkerMetadata: + """Creates metadata for this worker. + + This function is expected to be run on the actor. + """ + node_id = ray.get_runtime_context().get_node_id() + node_ip = ray.util.get_node_ip_address() + hostname = socket.gethostname() + accelerator_ids = ray.get_runtime_context().get_accelerator_ids() + pid = os.getpid() + + return WorkerMetadata( + node_id=node_id, + node_ip=node_ip, + hostname=hostname, + resource_ids=accelerator_ids, + pid=pid, + ) + + +class WorkerGroup: + """Group of Ray Actors that can execute arbitrary functions. + + ``WorkerGroup`` launches Ray actors according to the given + specification. It can then execute arbitrary Python functions in each of + these workers. + + If not enough resources are available to launch the actors, the Ray + cluster will automatically scale up if autoscaling is enabled. + + Args: + num_workers: The number of workers (Ray actors) to launch. + Defaults to 1. + resources_per_worker (Optional[Dict[str, float]]): + Dictionary specifying the resources that will be + requested for each worker. Defaults to {"CPU": 1}. + actor_cls (Optional[Type]): If specified use this class as the + remote actors. + remote_cls_args, remote_cls_kwargs: If ``remote_cls`` is provided, + these args will be used for the worker initialization. + placement_group (PlacementGroup|str): The placement group that workers + should be created in. Defaults to "default" which will inherit the + parent placement group (if child tasks should be captured). + + + Example: + + .. code_block:: python + + worker_group = WorkerGroup(num_workers=2) + output = worker_group.execute(lambda: 1) + assert len(output) == 2 + assert all(o == 1 for o in output) + """ + + def __init__( + self, + num_workers: int = 1, + resources_per_worker: Optional[Dict[str, float]] = None, + actor_cls: Type = None, + actor_cls_args: Optional[Tuple] = None, + actor_cls_kwargs: Optional[Dict] = None, + placement_group: Union[PlacementGroup, str] = "default", + ): + if resources_per_worker is None: + resources_per_worker = {"CPU": 1} + else: + resources_per_worker = resources_per_worker.copy() + + if num_workers <= 0: + raise ValueError( + "The provided `num_workers` must be greater " + f"than 0. Received num_workers={num_workers} " + f"instead." + ) + + if any(v < 0 for v in resources_per_worker.values()): + raise ValueError( + "The number of resources per worker must not be negative. " + f"Received resources_per_worker={resources_per_worker}." + ) + + if (actor_cls_args or actor_cls_kwargs) and not actor_cls: + raise ValueError( + "`actor_cls_args` or `actor_class_kwargs` are " + "passed in but no `actor_cls` is passed in." 
+ ) + + self.num_workers = num_workers + self.num_cpus_per_worker = resources_per_worker.pop("CPU", 0) + self.num_gpus_per_worker = resources_per_worker.pop("GPU", 0) + self.memory_per_worker = resources_per_worker.pop("memory", 0) + self.workers = [] + self._base_cls = create_executable_class(actor_cls) + assert issubclass(self._base_cls, RayTrainWorker) + + self._actor_cls_args = actor_cls_args or [] + self._actor_cls_kwargs = actor_cls_kwargs or {} + + self._placement_group = placement_group + + # TODO(matt): Validate resources. Fast-fail if it is impossible to + # handle the request, rather than hang indefinitely. + self._remote_cls = ray.remote( + num_cpus=self.num_cpus_per_worker, + num_gpus=self.num_gpus_per_worker, + memory=self.memory_per_worker, + resources=resources_per_worker, + )(self._base_cls) + self.start() + + def start(self): + """Starts all the workers in this worker group.""" + if self.workers and len(self.workers) > 0: + raise RuntimeError( + "The workers have already been started. " + "Please call `shutdown` first if you want to " + "restart them." + ) + + logger.debug(f"Starting {self.num_workers} workers.") + self.add_workers(self.num_workers) + logger.debug(f"{len(self.workers)} workers have successfully started.") + + def shutdown(self, patience_s: float = 5): + """Shutdown all the workers in this worker group. + + Args: + patience_s: Attempt a graceful shutdown + of the workers for this many seconds. Fallback to force kill + if graceful shutdown is not complete after this time. If + this is less than or equal to 0, immediately force kill all + workers. + """ + logger.debug(f"Shutting down {len(self.workers)} workers.") + if patience_s <= 0: + for worker in self.workers: + ray.kill(worker.actor) + else: + done_refs = [w.actor.__ray_terminate__.remote() for w in self.workers] + # Wait for actors to die gracefully. + done, not_done = ray.wait(done_refs, timeout=patience_s) + if not_done: + logger.debug("Graceful termination failed. Falling back to force kill.") + # If all actors are not able to die gracefully, then kill them. + for worker in self.workers: + ray.kill(worker.actor) + + logger.debug("Shutdown successful.") + self.workers = [] + + def execute_async(self, func: Callable[..., T], *args, **kwargs) -> List[ObjectRef]: + """Execute ``func`` on each worker and return the futures. + + Args: + func: A function to call on each worker. + args, kwargs: Passed directly into func. + + Returns: + (List[ObjectRef]) A list of ``ObjectRef`` representing the + output of ``func`` from each worker. The order is the same + as ``self.workers``. + + """ + if len(self.workers) <= 0: + raise RuntimeError( + "There are no active workers. This worker " + "group has most likely been shut down. Please" + "create a new WorkerGroup or restart this one." + ) + + return [ + w.actor._RayTrainWorker__execute.options( + name=f"_RayTrainWorker__execute.{func.__name__}" + ).remote(func, *args, **kwargs) + for w in self.workers + ] + + def execute(self, func: Callable[..., T], *args, **kwargs) -> List[T]: + """Execute ``func`` on each worker and return the outputs of ``func``. + + Args: + func: A function to call on each worker. + args, kwargs: Passed directly into func. + + Returns: + (List[T]) A list containing the output of ``func`` from each + worker. The order is the same as ``self.workers``. 
+ + """ + return ray.get(self.execute_async(func, *args, **kwargs)) + + def execute_single_async( + self, worker_index: int, func: Callable[..., T], *args, **kwargs + ) -> ObjectRef: + """Execute ``func`` on worker ``worker_index`` and return futures. + + Args: + worker_index: The index to execute func on. + func: A function to call on the first worker. + args, kwargs: Passed directly into func. + + Returns: + (ObjectRef) An ObjectRef representing the output of func. + + """ + if worker_index >= len(self.workers): + raise ValueError( + f"The provided worker_index {worker_index} is " + f"not valid for {self.num_workers} workers." + ) + return ( + self.workers[worker_index] + .actor._RayTrainWorker__execute.options( + name=f"_RayTrainWorker__execute.{func.__name__}" + ) + .remote(func, *args, **kwargs) + ) + + def execute_single( + self, worker_index: int, func: Callable[..., T], *args, **kwargs + ) -> T: + """Execute ``func`` on worker with index ``worker_index``. + + Args: + worker_index: The index to execute func on. + func: A function to call on the first worker. + args, kwargs: Passed directly into func. + + Returns: + (T) The output of func. + + """ + + return ray.get(self.execute_single_async(worker_index, func, *args, **kwargs)) + + def remove_workers(self, worker_indexes: List[int]): + """Removes the workers with the specified indexes. + + The removed workers will go out of scope and their actor processes + will be terminated. + + Args: + worker_indexes (List[int]): The indexes of the workers to remove. + """ + new_workers = [] + for i in range(len(self.workers)): + if i not in worker_indexes: + new_workers.append(self.workers[i]) + self.workers = new_workers + + def add_workers(self, num_workers: int): + """Adds ``num_workers`` to this WorkerGroup. + + Note: Adding workers when the cluster/placement group is at capacity + may lead to undefined hanging behavior. If you are attempting to + replace existing workers in the WorkerGroup, remove_workers() should + be called first. + + Args: + num_workers: The number of workers to add. + """ + new_actors = [] + new_actor_metadata = [] + for _ in range(num_workers): + actor = self._remote_cls.options( + placement_group=self._placement_group + ).remote(*self._actor_cls_args, **self._actor_cls_kwargs) + new_actors.append(actor) + new_actor_metadata.append( + actor._RayTrainWorker__execute.options( + name="_RayTrainWorker__execute.construct_metadata" + ).remote(construct_metadata) + ) + + # Get metadata from all actors. + metadata = ray.get(new_actor_metadata) + + for i in range(len(new_actors)): + self.workers.append(Worker(actor=new_actors[i], metadata=metadata[i])) + + def sort_workers_by_node_id_and_gpu_id(self, _first_node_id: Optional[str] = None): + """Reorder the workers by their node id and the lowest GPU id. + + This is useful for collocating workers on the same node. + + Example: + Given workers with the following attributes: + worker_0: node_id=1, gpu_ids=[1] + worker_1: node_id=0, gpu_ids=[0] + worker_2: node_id=1, gpu_ids=[0] + worker_3: node_id=0, gpu_ids=[1] + + The function will perform the following steps: + 1. Group by node ID: + node_id=0: worker_1, worker_3 + node_id=1: worker_0, worker_2 + + 2. Sort each group by GPU ID: + node_id=0: worker_1 (gpu_id=0), worker_3 (gpu_id=1) + node_id=1: worker_2 (gpu_id=0), worker_0 (gpu_id=1) + + Resulting in the order: [worker_1, worker_3, worker_2, worker_0] + + Args: + _first_node_id: The first ID to group by. 
+ Set this to the node ID of the trainer coordinator to ensure that the + rank 0 worker is on the same node, allowing additional resources to + be specified for rank 0 workers via + `ScalingConfig(trainer_resources=)`. + """ + node_id_to_workers = defaultdict(list) + + if _first_node_id is not None: + node_id_to_workers[_first_node_id] = [] + + for worker in self.workers: + node_id_to_workers[worker.metadata.node_id].append(worker) + + # Sort workers on the same node by the lowest GPU id + # More details: https://github.com/ray-project/ray/issues/40803 + def get_lowest_gpu_id(worker) -> int: + gpu_ids = worker.metadata.resource_ids.get("GPU", []) + # If there are no GPU IDs, return 0 as a default + if not gpu_ids: + return 0 + + # Attempt to convert GPU IDs to integers and find the minimum ID. + # Fallback to return the minimum string-based ID + try: + return min(int(gpu_id) for gpu_id in gpu_ids) + except ValueError: + return min(gpu_ids) + + for node_id in node_id_to_workers: + node_id_to_workers[node_id].sort(key=get_lowest_gpu_id) + + sorted_workers = [] + for workers in node_id_to_workers.values(): + sorted_workers.extend(workers) + + self.workers = sorted_workers + + def __len__(self): + return len(self.workers) diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73f9e7ab585a1d577f589e9c795a5560d830169a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py @@ -0,0 +1,22 @@ +# isort: off +try: + import horovod # noqa: F401 +except ModuleNotFoundError: + raise ModuleNotFoundError( + "Horovod isn't installed. To install Horovod with PyTorch support, run 'pip " + "install 'horovod[pytorch]''. To install Horovod with TensorFlow support, " + "run 'pip install 'horovod[tensorflow]''." + ) +# isort: on + +from ray.train.horovod.config import HorovodConfig +from ray.train.horovod.horovod_trainer import HorovodTrainer +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.horovod.horovod_trainer import HorovodTrainer # noqa: F811 + +__all__ = ["HorovodConfig", "HorovodTrainer"] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
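For reference, the two-step ordering that `WorkerGroup.sort_workers_by_node_id_and_gpu_id` documents above (group by node ID, then sort each group by its lowest GPU ID) can be reproduced with a short standalone sketch. The worker records below are made-up stand-ins for `Worker` objects, mirroring `WorkerMetadata.node_id` and `resource_ids["GPU"]`; only the sorting logic itself is taken from the module.

    from collections import defaultdict

    # Made-up stand-ins for WorkerGroup.workers (same values as the docstring example).
    workers = [
        {"name": "worker_0", "node_id": "1", "gpu_ids": ["1"]},
        {"name": "worker_1", "node_id": "0", "gpu_ids": ["0"]},
        {"name": "worker_2", "node_id": "1", "gpu_ids": ["0"]},
        {"name": "worker_3", "node_id": "0", "gpu_ids": ["1"]},
    ]

    # Step 1: group by node ID, preserving the order in which nodes are first seen.
    by_node = defaultdict(list)
    for w in workers:
        by_node[w["node_id"]].append(w)

    # Step 2: sort each node's group by the lowest GPU ID
    # (workers without GPUs default to 0, as in the module).
    def lowest_gpu_id(w):
        return min((int(g) for g in w["gpu_ids"]), default=0)

    sorted_workers = [
        w for group in by_node.values() for w in sorted(group, key=lowest_gpu_id)
    ]
    print([w["name"] for w in sorted_workers])
    # ['worker_2', 'worker_0', 'worker_1', 'worker_3']

Within each node the worker with the lowest GPU ID comes first; the overall node order simply follows the order in which nodes are first encountered in `self.workers`, except that `_first_node_id`, when given, is placed first so that the coordinator's node (and hence the rank 0 worker) leads.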
diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c55fe6bb561e328104dba704d1067e22d44b0ca0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2f1c19cab0bcd7da37aec40fc2e37851fa2217c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcbf7251d4b2b71855ad473b0b95f782530ce783 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py new file mode 100644 index 0000000000000000000000000000000000000000..acd56091d3a4d7bc505bad007c0d394d103269ae --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py @@ -0,0 +1,159 @@ +import os +from dataclasses import dataclass +from typing import Optional, Set + +from horovod.ray.runner import Coordinator +from horovod.ray.utils import detect_nics, nics_to_env_var +from horovod.runner.common.util import secret, timeout + +import ray +from ray.train._internal.utils import update_env_vars +from ray.train._internal.worker_group import Worker, WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + + +@PublicAPI(stability="beta") +@dataclass +class HorovodConfig(BackendConfig): + """Configurations for Horovod setup. + + See https://github.com/horovod/horovod/blob/master/horovod/runner/common/util/settings.py # noqa: E501 + + Args: + nics (Optional[Set[str]): Network interfaces that can be used for + communication. + verbose: Horovod logging verbosity. + key (Optional[str]): Secret used for communication between workers. + ssh_port (Optional[int]): Port for SSH server running on worker nodes. + ssh_identity_file (Optional[str]): Path to the identity file to + ssh into different hosts on the cluster. + ssh_str (Optional[str]): CAUTION WHEN USING THIS. Private key + file contents. Writes the private key to ssh_identity_file. + timeout_s: Timeout parameter for Gloo rendezvous. + placement_group_timeout_s: Timeout parameter for Ray + Placement Group creation. Currently unused. + """ + + nics: Optional[Set[str]] = None + verbose: int = 1 + key: Optional[str] = None + ssh_port: Optional[int] = None + ssh_identity_file: Optional[str] = None + ssh_str: Optional[str] = None + timeout_s: int = 300 + placement_group_timeout_s: int = 100 + + @property + def start_timeout(self): + return timeout.Timeout( + self.timeout_s, + message="Timed out waiting for {activity}. Please " + "check connectivity between servers. 
You " + "may need to increase the --start-timeout " + "parameter if you have too many servers.", + ) + + def __post_init__(self): + if self.ssh_str and not os.path.exists(self.ssh_identity_file): + with open(self.ssh_identity_file, "w") as f: + os.chmod(self.ssh_identity_file, 0o600) + f.write(self.ssh_str) + + if self.key is None: + self.key = secret.make_secret_key() + + @property + def backend_cls(self): + return _HorovodBackend + + +class _HorovodBackend(Backend): + share_cuda_visible_devices: bool = True + + def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig): + # TODO(matt): Implement placement group strategies in BackendExecutor. + + # Initialize workers with Horovod environment variables + setup_futures = [] + for rank in range(len(worker_group)): + worker_node_id = worker_group.workers[rank].metadata.node_id + setup_futures.append( + worker_group.execute_single_async( + rank, + _init_env_vars, + rank, + len(worker_group), + worker_node_id, + ) + ) + ray.get(setup_futures) + + # Use Horovod Ray Coordinator + # backend_config as settings + self.coordinator = Coordinator(backend_config) + + # Get all the hostnames of all workers + node_ids = [w.metadata.node_id for w in worker_group.workers] + hostnames = [w.metadata.hostname for w in worker_group.workers] + # Register each hostname to the coordinator. assumes the hostname + # ordering is the same. + for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)): + self.coordinator.register(hostname, node_id, rank) + all_info = self.coordinator.finalize_registration() + + setup_futures = [] + for rank, local_cross_env_var in all_info.items(): + setup_futures.append( + worker_group.execute_single_async( + rank, update_env_vars, local_cross_env_var + ) + ) + ray.get(setup_futures) + + coordinator_envs = self.coordinator.establish_rendezvous() + + # Get one worker from each host/node. + node_worker_indexes = [node_ids.index(node_id) for node_id in set(node_ids)] + node_workers = [ + _HorovodWorkerWrapper(worker_group.workers[worker_index]) + for worker_index in node_worker_indexes + ] + assert len(node_workers) == len(self.coordinator.hostnames) + + nics = detect_nics( + backend_config, + all_host_names=list(self.coordinator.hostnames), + node_workers=node_workers, + ) + coordinator_envs.update(nics_to_env_var(nics)) + + worker_group.execute(update_env_vars, coordinator_envs) + + +def _init_env_vars(world_rank: int, world_size: int, node_id: str): + """Initialize Horovod environment variables.""" + os.environ["HOROVOD_HOSTNAME"] = node_id + os.environ["HOROVOD_RANK"] = str(world_rank) + os.environ["HOROVOD_SIZE"] = str(world_size) + + +# TODO(tgaddair): temporary workaround for Horovod's worker discovery logic, +# which requires passing in an extra parameter as part of the RayExecutor +# API. This will be removed in the future as we migrate more of the +# RayExecutor utils into Ray Train. 
+# See: https://github.com/horovod/horovod/blob/v0.23.0/horovod/ray/driver_service.py#L9 # noqa: E501 +@dataclass +class _HorovodWorkerWrapper: + w: Worker + + @property + def execute(self): + w = self.w + + class ExecuteHandle: + def remote(self, func, *args, **kwargs): + _ = None + return w.actor._RayTrainWorker__execute.remote(func, _, *args, **kwargs) + + return ExecuteHandle() diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..27d97572d7935814d3a18747d2a2b22d5c09c179 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py @@ -0,0 +1,202 @@ +from typing import Any, Callable, Dict, Optional, Union + +from ray.air.config import RunConfig, ScalingConfig +from ray.train import Checkpoint, DataConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.horovod.config import HorovodConfig +from ray.train.trainer import GenDataset +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +class HorovodTrainer(DataParallelTrainer): + """A Trainer for data parallel Horovod training. + + This Trainer runs the function ``train_loop_per_worker`` on multiple Ray + Actors. These actors already have the necessary Horovod setup already + configured for distributed Horovod training. + + The ``train_loop_per_worker`` function is expected to take in either 0 or 1 + arguments: + + .. testcode:: + + def train_loop_per_worker(): + ... + + .. testcode:: + + def train_loop_per_worker(config: Dict): + ... + + If ``train_loop_per_worker`` accepts an argument, then + ``train_loop_config`` will be passed in as the argument. This is useful if you + want to tune the values in ``train_loop_config`` as hyperparameters. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + ``train_loop_per_worker``. All the other datasets will not be split and + ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + + Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray Train loop methods `. + + .. testcode:: + + from ray import train + + def train_loop_per_worker(): + # Report intermediate results for callbacks or logging and + # checkpoint data. + train.report(...) + + # Returns dict of last saved checkpoint. + train.get_checkpoint() + + # Returns the Dataset shard for the given key. + train.get_dataset_shard("my_dataset") + + # Returns the total number of workers executing training. + train.get_context().get_world_size() + + # Returns the rank of this worker. + train.get_context().get_world_rank() + + # Returns the rank of the worker on the current node. + train.get_context().get_local_rank() + + Any returns from the ``train_loop_per_worker`` will be discarded and not + used or persisted anywhere. + + You could use ``TensorflowPredictor`` or ``TorchPredictor`` in conjunction with + HorovodTrainer. You must save the model under the "model" kwarg in the + ``Checkpoint`` passed to ``train.report()``, so that it can be used by + corresponding predictors. + + Example: + + + .. 
testcode:: + :skipif: True + + import os + import tempfile + + import ray + import horovod.torch as hvd + import torch + import torch.nn as nn + + from ray import train + import ray.train.torch # Need this to use `train.torch.get_device()` + from ray.train import Checkpoint, ScalingConfig + from ray.train.horovod import HorovodTrainer + + # If using GPUs, set this to True. + use_gpu = False + + input_size = 1 + layer_size = 15 + output_size = 1 + num_epochs = 3 + + class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.layer1 = nn.Linear(input_size, layer_size) + self.relu = nn.ReLU() + self.layer2 = nn.Linear(layer_size, output_size) + def forward(self, input): + return self.layer2(self.relu(self.layer1(input))) + + def train_loop_per_worker(): + hvd.init() + dataset_shard = train.get_dataset_shard("train") + model = NeuralNetwork() + device = train.torch.get_device() + model.to(device) + loss_fn = nn.MSELoss() + lr_scaler = 1 + optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * lr_scaler) + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer( + optimizer, + named_parameters=model.named_parameters(), + op=hvd.Average, + ) + for epoch in range(num_epochs): + model.train() + for batch in dataset_shard.iter_torch_batches( + batch_size=32, dtypes=torch.float + ): + inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"] + outputs = model(inputs) + loss = loss_fn(outputs, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + print(f"epoch: {epoch}, loss: {loss.item()}") + + # Save a model checkpoint at the end of each epoch + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + ckpt_path = os.path.join(temp_checkpoint_dir, "model.pt") + torch.save(model.state_dict(), ckpt_path) + train.report( + {"loss": loss.item(), "epoch": epoch}, + checkpoint=Checkpoint.from_directory(temp_checkpoint_dir), + ) + + train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu) + trainer = HorovodTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=scaling_config, + datasets={"train": train_dataset}, + ) + result = trainer.fit() + + Args: + train_loop_per_worker: The training function to execute. + This can either take in no arguments or a ``config`` dict. + train_loop_config: Configurations to pass into + ``train_loop_per_worker`` if it accepts an argument. + horovod_config: Configuration for setting up the Horovod backend. + If set to None, use the default configuration. This replaces the + ``backend_config`` arg of ``DataParallelTrainer``. + scaling_config: Configuration for how to scale data parallel training. + dataset_config: Configuration for dataset ingest. + run_config: Configuration for the execution of the training run. + datasets: Any Datasets to use for training. Use + the key "train" to denote which dataset is the training + dataset. + resume_from_checkpoint: A checkpoint to resume training from. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + horovod_config: Optional[HorovodConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[DataConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super().__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=horovod_config or HorovodConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e413a1030837512e86386b76f1c12ce21b2a92 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py @@ -0,0 +1,39 @@ +# isort: off +try: + import lightning # noqa: F401 +except ModuleNotFoundError: + try: + import pytorch_lightning # noqa: F401 + except ModuleNotFoundError: + raise ModuleNotFoundError( + "PyTorch Lightning isn't installed. To install PyTorch Lightning, " + "please run 'pip install lightning'" + ) +# isort: on + +from ray.train.lightning._lightning_utils import ( + RayDDPStrategy, + RayDeepSpeedStrategy, + RayFSDPStrategy, + RayLightningEnvironment, + RayTrainReportCallback, + prepare_trainer, +) +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.lightning.lightning_utils import ( # noqa: F811 + RayTrainReportCallback, + ) + +__all__ = [ + "prepare_trainer", + "RayDDPStrategy", + "RayFSDPStrategy", + "RayDeepSpeedStrategy", + "RayLightningEnvironment", + "RayTrainReportCallback", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc0c6e0201df800a354fdf824cda9ba7b94ea821 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2397b6a8179b4c3cc4c01a080a333f7e4ab0ddd7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py b/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba42fe12f4ba36db70620a7aeba310a6b0ccc66f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py @@ -0,0 +1,295 @@ +import logging +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any, Dict + +import torch +from packaging.version import Version + +import ray +from ray import train +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.train import Checkpoint +from ray.util import PublicAPI + + +def import_lightning(): # noqa: F402 + try: + import lightning.pytorch as pl + except ModuleNotFoundError: + import pytorch_lightning as pl + return pl + + +pl = import_lightning() + +_LIGHTNING_GREATER_EQUAL_2_0 = Version(pl.__version__) >= Version("2.0.0") +_LIGHTNING_LESS_THAN_2_1 = Version(pl.__version__) < Version("2.1.0") +_TORCH_GREATER_EQUAL_1_12 = Version(torch.__version__) >= Version("1.12.0") +_TORCH_FSDP_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.distributed.is_available() + +try: + from lightning.pytorch.plugins.environments import LightningEnvironment +except ModuleNotFoundError: + from pytorch_lightning.plugins.environments import LightningEnvironment + +if _LIGHTNING_GREATER_EQUAL_2_0: + FSDPStrategy = pl.strategies.FSDPStrategy +else: + FSDPStrategy = pl.strategies.DDPFullyShardedStrategy + +if _TORCH_FSDP_AVAILABLE: + from torch.distributed.fsdp import ( + FullStateDictConfig, + FullyShardedDataParallel, + StateDictType, + ) + + +logger = logging.getLogger(__name__) + +LIGHTNING_REPORT_STAGE_KEY = "_report_on" + + +@PublicAPI(stability="beta") +class RayDDPStrategy(pl.strategies.DDPStrategy): + """Subclass of DDPStrategy to ensure compatibility with Ray orchestration. + + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DDPStrategy.html + + Note that `process_group_backend`, `timeout`, and `start_method` are disabled here, + please specify these arguments in :class:`~ray.train.torch.TorchConfig` instead. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDDPSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + +@PublicAPI(stability="beta") +class RayFSDPStrategy(FSDPStrategy): # noqa: F821 + """Subclass of FSDPStrategy to ensure compatibility with Ray orchestration. + + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.FSDPStrategy.html + + .. note:: + It is recommended to upgrade `lightning>=2.1` or above when using FSDP + with Lightning, since Lightning starts to natively support `state_dict_type`, + `sharding_strategy`, `auto_wrap_policy` and other FSDP configurations from 2.1. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYFSDPSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + def lightning_module_state_dict(self) -> Dict[str, Any]: + """Gathers the full state dict to rank 0 on CPU. + + FSDP checkpointing is broken in Lightning 2.0.x. This subclass patches the + behavior to perform a full state dict checkpointing, gathering the checkpoint + shards on rank 0 CPU. Upgrade to `lightning>=2.1` to do sharded state dict + checkpointing. + + See the note in the class docstring for more details. + """ + + assert self.model is not None, "Failed to get the state dict for a None model!" + + if ( + _TORCH_FSDP_AVAILABLE + and _LIGHTNING_GREATER_EQUAL_2_0 + and _LIGHTNING_LESS_THAN_2_1 + ): + with FullyShardedDataParallel.state_dict_type( + module=self.model, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=FullStateDictConfig( + offload_to_cpu=True, rank0_only=True + ), + ): + state_dict = self.model.state_dict() + + ckpt_state_dict = {} + prefix_len = len("_forward_module.") + for k, v in state_dict.items(): + if k.startswith("_forward_module."): + non_prefixed_key = k[prefix_len:] + ckpt_state_dict[non_prefixed_key] = v + else: + ckpt_state_dict[k] = v + return ckpt_state_dict + else: + # Otherwise Lightning uses Fairscale FSDP, no need to unshard by ourself. + return super().lightning_module_state_dict() + + +@PublicAPI(stability="beta") +class RayDeepSpeedStrategy(pl.strategies.DeepSpeedStrategy): + """Subclass of DeepSpeedStrategy to ensure compatibility with Ray orchestration. 
+ + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DeepSpeedStrategy.html + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDEEPSPEEDSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + +@PublicAPI(stability="beta") +class RayLightningEnvironment(LightningEnvironment): # noqa: F821 + """Setup Lightning DDP training environment for Ray cluster.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT, "1") + + def world_size(self) -> int: + return train.get_context().get_world_size() + + def global_rank(self) -> int: + return train.get_context().get_world_rank() + + def local_rank(self) -> int: + return train.get_context().get_local_rank() + + def node_rank(self) -> int: + return train.get_context().get_node_rank() + + def set_world_size(self, size: int) -> None: + # Disable it since `world_size()` directly returns data from Train context. + pass + + def set_global_rank(self, rank: int) -> None: + # Disable it since `global_rank()` directly returns data from Train. + pass + + def teardown(self): + pass + + +@PublicAPI(stability="beta") +def prepare_trainer(trainer: pl.Trainer) -> pl.Trainer: + """Prepare the PyTorch Lightning Trainer for distributed execution.""" + + # Check strategy class + valid_strategy_class = [RayDDPStrategy, RayFSDPStrategy, RayDeepSpeedStrategy] + + if not any(isinstance(trainer.strategy, cls) for cls in valid_strategy_class): + raise RuntimeError( + f"Invalid strategy class: {type(trainer.strategy)}. To use " + "PyTorch Lightning with Ray, the strategy object should be one of " + f"{[cls.__name__ for cls in valid_strategy_class]} class " + "or its subclass." + ) + + # Check cluster environment + cluster_environment = getattr(trainer.strategy, "cluster_environment", None) + if cluster_environment and not isinstance( + cluster_environment, RayLightningEnvironment + ): + raise RuntimeError( + "Invalid cluster environment plugin. The expected class is" + "`ray.train.lightning.RayLightningEnvironment` " + f"but got {type(cluster_environment)}!" + ) + + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_PREPARE_TRAINER, "1") + return trainer + + +@PublicAPI(stability="beta") +class RayTrainReportCallback(pl.callbacks.Callback): + """A simple callback that reports checkpoints to Ray on train epoch end. + + This callback is a subclass of `lightning.pytorch.callbacks.Callback + `_. + + It fetches the latest `trainer.callback_metrics` and reports together with + the checkpoint on each training epoch end. + + Checkpoints will be saved in the following structure:: + + checkpoint_00000*/ Ray Train Checkpoint + └─ checkpoint.ckpt PyTorch Lightning Checkpoint + + For customized reporting and checkpointing logic, implement your own + `lightning.pytorch.callbacks.Callback` following this user + guide: :ref:`Saving and Loading Checkpoints `. 
+ """ + + CHECKPOINT_NAME = "checkpoint.ckpt" + + def __init__(self) -> None: + super().__init__() + self.trial_name = train.get_context().get_trial_name() + self.local_rank = train.get_context().get_local_rank() + self.tmpdir_prefix = Path(tempfile.gettempdir(), self.trial_name).as_posix() + if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0: + shutil.rmtree(self.tmpdir_prefix) + + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK, "1") + + def on_train_epoch_end(self, trainer, pl_module) -> None: + # Creates a checkpoint dir with fixed name + tmpdir = Path(self.tmpdir_prefix, str(trainer.current_epoch)).as_posix() + os.makedirs(tmpdir, exist_ok=True) + + # Fetch metrics + metrics = trainer.callback_metrics + metrics = {k: v.item() for k, v in metrics.items()} + + # (Optional) Add customized metrics + metrics["epoch"] = trainer.current_epoch + metrics["step"] = trainer.global_step + + # Save checkpoint to local + ckpt_path = Path(tmpdir, self.CHECKPOINT_NAME).as_posix() + trainer.save_checkpoint(ckpt_path, weights_only=False) + + # Report to train session + checkpoint = Checkpoint.from_directory(tmpdir) + train.report(metrics=metrics, checkpoint=checkpoint) + + # Add a barrier to ensure all workers finished reporting here + trainer.strategy.barrier() + + if self.local_rank == 0: + shutil.rmtree(tmpdir) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..db989336afd15a2d2147d45f9b6685f3e823598c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py @@ -0,0 +1,56 @@ +# isort: off +try: + import torch # noqa: F401 +except ModuleNotFoundError: + raise ModuleNotFoundError( + "PyTorch isn't installed. To install PyTorch, run 'pip install torch'" + ) +# isort: on + +from ray.train.torch.config import TorchConfig +from ray.train.torch.torch_checkpoint import TorchCheckpoint +from ray.train.torch.torch_detection_predictor import TorchDetectionPredictor +from ray.train.torch.torch_predictor import TorchPredictor +from ray.train.torch.torch_trainer import TorchTrainer +from ray.train.torch.train_loop_utils import ( + accelerate, + backward, + enable_reproducibility, + get_device, + get_devices, + prepare_data_loader, + prepare_model, + prepare_optimizer, +) +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.torch.torch_trainer import TorchTrainer # noqa: F811 + from ray.train.v2.torch.train_loop_utils import ( # noqa: F811 + accelerate, + backward, + enable_reproducibility, + prepare_data_loader, + prepare_model, + prepare_optimizer, + ) + + +__all__ = [ + "TorchTrainer", + "TorchCheckpoint", + "TorchConfig", + "accelerate", + "get_device", + "get_devices", + "prepare_model", + "prepare_optimizer", + "prepare_data_loader", + "backward", + "enable_reproducibility", + "TorchPredictor", + "TorchDetectionPredictor", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d891e2dffd497d7d9fd0d8b6b0bfc71b01cd16e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd78d8f68b7ca8289824aee66cf8b41ed5add595 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffda8831838ce5559b2c84d1e1a323f47294b203 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac54b56fb43d2015fd69d8ee1c372ce3587d6c92 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d334e9153056cde09ca4870a7b611704a137696 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78a06db6591e2f6169a947f25a6f179e6f5111dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f6a266fd2d31ca9c6fa04b1113af3b3530d98dd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/config.py b/.venv/lib/python3.11/site-packages/ray/train/torch/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a0ecc61e3b874e1624b4e5ccdd3ae0e77a176d68 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/config.py @@ -0,0 +1,213 @@ +import logging +import os +from dataclasses import dataclass +from datetime import timedelta +from 
typing import Optional + +import torch +import torch.distributed as dist +from packaging.version import Version + +import ray +from ray.air._internal.device_manager import register_custom_torch_dist_backend +from ray.train._internal.utils import get_address_and_port +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +class TorchConfigContextManager: + def __enter__(self): + # Set default cuda device + if torch.cuda.is_available(): + device = ray.train.torch.get_device() + if device.type == "cuda": + torch.cuda.set_device(device) + + def __exit__(self, type, value, traceback): + # Propagate exceptions if any + return False + + +@PublicAPI(stability="stable") +@dataclass +class TorchConfig(BackendConfig): + """Configuration for torch process group setup. + + See https://pytorch.org/docs/stable/distributed.html for more info. + + Args: + backend: The backend to use for training. + See ``torch.distributed.init_process_group`` for more info and + valid values. + If set to None, nccl will be used if GPUs are requested, else gloo + will be used. + init_method: The initialization method to use. Either "env" + for environment variable initialization or "tcp" for TCP + initialization. Defaults to "env". + timeout_s: Seconds for process group operations to timeout. + """ + + backend: Optional[str] = None + init_method: str = "env" + timeout_s: int = 1800 + + @property + def backend_cls(self): + return _TorchBackend + + @property + def train_func_context(self): + return TorchConfigContextManager + + +def _setup_torch_process_group( + backend: str, + world_rank: int, + world_size: int, + init_method: str, + timeout_s: int = 1800, +): + """Connects the distributed PyTorch backend. + + Args: + backend: The backend (nccl, gloo, etc.) to use for training. + world_rank: Rank of the current worker. + world_size: Number of workers participating in the job. + init_method: URL specifying how to initialize the process group. + timeout_s: Seconds for process group operations to timeout. + """ + if world_rank == 0: + logger.info( + f"Setting up process group for: {init_method} [rank={world_rank}, " + f"world_size={world_size}]" + ) + else: + logger.debug( + f"Setting up process group for: {init_method} [rank={world_rank}, " + f"world_size={world_size}]" + ) + logger.debug(f"using {backend}") + + if backend == "nccl": + # See https://github.com/pytorch/pytorch/blob/c263bd43e8e8502d4726643bc6fd046f0130ac0e/torch/distributed/distributed_c10d.py#L803-L823 # noqa: E501 + # We do not use TORCH_NCCL_BLOCKING_WAIT due to performance overhead. + if Version(torch.__version__) < Version("2.2.0"): + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR = "NCCL_ASYNC_ERROR_HANDLING" + TORCH_NCCL_BLOCKING_WAIT_ENV_VAR = "NCCL_BLOCKING_WAIT" + else: + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR = "TORCH_NCCL_ASYNC_ERROR_HANDLING" + TORCH_NCCL_BLOCKING_WAIT_ENV_VAR = "TORCH_NCCL_BLOCKING_WAIT" + if ( + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR not in os.environ + and TORCH_NCCL_BLOCKING_WAIT_ENV_VAR not in os.environ + ): + logger.debug( + f"Setting {TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR}=1 to fail if NCCL collective communication operations are timing out. " # noqa: E501 + f"To override this behavior, you can set {TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR}=0." 
# noqa: E501 + ) + os.environ[TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR] = "1" + elif backend == "hccl": + register_custom_torch_dist_backend(backend) + + dist.init_process_group( + backend=backend, + init_method=init_method, + rank=world_rank, + world_size=world_size, + timeout=timedelta(seconds=timeout_s), + ) + + +def _shutdown_torch(destroy_process_group=False): + from ray.air._internal.torch_utils import get_devices + + devices = get_devices() + if destroy_process_group: + dist.destroy_process_group() + if torch.cuda.is_available(): + for device in devices: + with torch.cuda.device(device): + torch.cuda.empty_cache() + + +def _set_torch_distributed_env_vars(): + # Same env vars as in + # https://pytorch.org/docs/stable/elastic/run.html#environment-variables + from ray.train.torch import get_device + + context = ray.train.get_context() + os.environ["LOCAL_RANK"] = str(context.get_local_rank()) + os.environ["RANK"] = str(context.get_world_rank()) + os.environ["LOCAL_WORLD_SIZE"] = str(context.get_local_world_size()) + os.environ["WORLD_SIZE"] = str(context.get_world_size()) + os.environ["NODE_RANK"] = str(context.get_node_rank()) + + # Makes sure Hugging Face Accelerate uses the correct device + device = get_device() + os.environ["ACCELERATE_TORCH_DEVICE"] = str(device) + + +class _TorchBackend(Backend): + share_cuda_visible_devices: bool = True + + def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig): + if dist.is_available(): + # Set the appropriate training backend. + if backend_config.backend is None: + if worker_group.num_gpus_per_worker > 0: + backend = "nccl" + else: + backend = "gloo" + else: + backend = backend_config.backend + + master_addr, master_port = worker_group.execute_single( + 0, get_address_and_port + ) + if backend_config.init_method == "env": + + def set_env_vars(addr, port): + os.environ["MASTER_ADDR"] = addr + os.environ["MASTER_PORT"] = str(port) + + worker_group.execute(set_env_vars, addr=master_addr, port=master_port) + url = "env://" + elif backend_config.init_method == "tcp": + url = f"tcp://{master_addr}:{master_port}" + else: + raise ValueError( + f"The provided init_method (" + f"{backend_config.init_method}) is not supported. Must " + f"be either 'env' or 'tcp'." 
+ ) + + setup_futures = [] + for i in range(len(worker_group)): + setup_futures.append( + worker_group.execute_single_async( + i, + _setup_torch_process_group, + backend=backend, + world_rank=i, + world_size=len(worker_group), + init_method=url, + timeout_s=backend_config.timeout_s, + ) + ) + ray.get(setup_futures) + else: + raise RuntimeError("Distributed torch is not available.") + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchConfig): + worker_group.execute( + _shutdown_torch, + destroy_process_group=len(worker_group) > 1, + ) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: BackendConfig + ): + worker_group.execute(_set_torch_distributed_env_vars) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..7b6eeae305185bdd638042a7c1a0386f8d26bd86 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py @@ -0,0 +1,182 @@ +import os +import tempfile +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional + +import torch + +from ray.air._internal.torch_utils import ( + consume_prefix_in_state_dict_if_present_not_in_place, + load_torch_model, +) +from ray.train._internal.framework_checkpoint import FrameworkCheckpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +ENCODED_DATA_KEY = "torch_encoded_data" + + +@PublicAPI(stability="beta") +class TorchCheckpoint(FrameworkCheckpoint): + """A :class:`~ray.train.Checkpoint` with Torch-specific functionality.""" + + MODEL_FILENAME = "model.pt" + + @classmethod + def from_state_dict( + cls, + state_dict: Dict[str, Any], + *, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TorchCheckpoint": + """Create a :class:`~ray.train.Checkpoint` that stores a model state dictionary. + + .. tip:: + + This is the recommended method for creating + :class:`TorchCheckpoints`. + + Args: + state_dict: The model state dictionary to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :class:`TorchCheckpoint` containing the specified state dictionary. + + Examples: + + .. testcode:: + + import torch + import torch.nn as nn + from ray.train.torch import TorchCheckpoint + + # Set manual seed + torch.manual_seed(42) + + # Function to create a NN model + def create_model() -> nn.Module: + model = nn.Sequential(nn.Linear(1, 10), + nn.ReLU(), + nn.Linear(10,1)) + return model + + # Create a TorchCheckpoint from our model's state_dict + model = create_model() + checkpoint = TorchCheckpoint.from_state_dict(model.state_dict()) + + # Now load the model from the TorchCheckpoint by providing the + # model architecture + model_from_chkpt = checkpoint.get_model(create_model()) + + # Assert they have the same state dict + assert str(model.state_dict()) == str(model_from_chkpt.state_dict()) + print("worked") + + .. testoutput:: + :hide: + + ... + """ + tempdir = tempfile.mkdtemp() + + model_path = Path(tempdir, cls.MODEL_FILENAME).as_posix() + stripped_state_dict = consume_prefix_in_state_dict_if_present_not_in_place( + state_dict, "module." 
+ ) + torch.save(stripped_state_dict, model_path) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + return checkpoint + + @classmethod + def from_model( + cls, + model: torch.nn.Module, + *, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TorchCheckpoint": + """Create a :class:`~ray.train.Checkpoint` that stores a Torch model. + + .. note:: + + PyTorch recommends storing state dictionaries. To create a + :class:`TorchCheckpoint` from a state dictionary, call + :meth:`~ray.train.torch.TorchCheckpoint.from_state_dict`. To learn more + about state dictionaries, read + `Saving and Loading Models `_. # noqa: E501 + + Args: + model: The Torch model to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :class:`TorchCheckpoint` containing the specified model. + + Examples: + + .. testcode:: + + from ray.train.torch import TorchCheckpoint + import torch + + # Create model identity and send a random tensor to it + model = torch.nn.Identity() + input = torch.randn(2, 2) + output = model(input) + + # Create a checkpoint + checkpoint = TorchCheckpoint.from_model(model) + print(checkpoint) + + .. testoutput:: + :hide: + + ... + """ + tempdir = tempfile.mkdtemp() + + model_path = Path(tempdir, cls.MODEL_FILENAME).as_posix() + torch.save(model, model_path) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + return checkpoint + + def get_model(self, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: + """Retrieve the model stored in this checkpoint. + + Args: + model: If the checkpoint contains a model state dict, and not + the model itself, then the state dict will be loaded to this + ``model``. Otherwise, the model will be discarded. + """ + with self.as_directory() as tempdir: + model_path = Path(tempdir, self.MODEL_FILENAME).as_posix() + if not os.path.exists(model_path): + raise RuntimeError( + "`model.pt` not found within this checkpoint. Make sure you " + "created this `TorchCheckpoint` from one of its public " + "constructors (`from_state_dict` or `from_model`)." + ) + model_or_state_dict = torch.load(model_path, map_location="cpu") + + if isinstance(model_or_state_dict, torch.nn.Module): + if model: + warnings.warn( + "TorchCheckpoint already contains all information needed. " + "Discarding provided `model` argument. If you are using " + "TorchPredictor directly, you should do " + "`TorchPredictor.from_checkpoint(checkpoint)` by removing kwargs " + "`model=`." + ) + model = load_torch_model( + saved_model=model_or_state_dict, model_definition=model + ) + return model diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..98a708a731ca2976d57c7866dfc0707d5c255545 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py @@ -0,0 +1,90 @@ +import collections +from typing import Dict, List, Optional, Union + +import numpy as np +import torch + +from ray.train._internal.dl_predictor import TensorDtype +from ray.train.torch.torch_predictor import TorchPredictor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class TorchDetectionPredictor(TorchPredictor): + """A predictor for TorchVision detection models. 
+ + Unlike other Torch models, instance segmentation models return + `List[Dict[str, Tensor]]`. This predictor extends :class:`TorchPredictor` to support + the non-standard outputs. + + To learn more about instance segmentation models, read + `Instance segmentation models `_. + + Example: + + .. testcode:: + + import numpy as np + from torchvision import models + + from ray.train.torch import TorchDetectionPredictor + + model = models.detection.fasterrcnn_resnet50_fpn_v2(pretrained=True) + + predictor = TorchDetectionPredictor(model=model) + predictions = predictor.predict(np.zeros((4, 3, 32, 32), dtype=np.float32)) + + print(predictions.keys()) + + .. testoutput:: + + dict_keys(['pred_boxes', 'pred_labels', 'pred_scores']) + + """ # noqa: E501 + + def _predict_numpy( + self, + data: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Dict[str, np.ndarray]: + if isinstance(data, dict) and len(data) != 1: + raise ValueError( + f"""Expected input to contain one key, but got {len(data)} instead.""" + ) + + if dtype is not None and not isinstance(dtype, torch.dtype): + raise ValueError( + "Expected `dtype` to be a `torch.dtype`, but got a " + f"{type(dtype).__name__} instead." + ) + + if isinstance(data, dict): + images = next(iter(data.values())) + else: + images = data + + inputs = [ + torch.as_tensor(image, dtype=dtype).to(self.device) for image in images + ] + outputs = self.call_model(inputs) + outputs = _convert_outputs_to_batch(outputs) + outputs = {"pred_" + key: value for key, value in outputs.items()} + + return outputs + + +def _convert_outputs_to_batch( + outputs: List[Dict[str, torch.Tensor]], +) -> Dict[str, List[torch.Tensor]]: + """Batch detection model outputs. + + TorchVision detection models return `List[Dict[Tensor]]`. Each `Dict` contain + 'boxes', 'labels, and 'scores'. + + This function batches values and returns a `Dict[str, List[Tensor]]`. + """ # noqa: E501 + batch = collections.defaultdict(list) + for output in outputs: + for key, value in output.items(): + batch[key].append(value.cpu().detach()) + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..be81d5b0f68c0a397239940bc9b310fc79a0b864 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py @@ -0,0 +1,250 @@ +import logging +from typing import TYPE_CHECKING, Dict, Optional, Union + +import numpy as np +import torch + +from ray.air._internal.torch_utils import convert_ndarray_batch_to_torch_tensor_batch +from ray.train._internal.dl_predictor import DLPredictor +from ray.train.predictor import DataBatchType +from ray.train.torch import TorchCheckpoint +from ray.util import log_once +from ray.util.annotations import DeveloperAPI, PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="beta") +class TorchPredictor(DLPredictor): + """A predictor for PyTorch models. + + Args: + model: The torch module to use for predictions. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. 
+ """ + + def __init__( + self, + model: torch.nn.Module, + preprocessor: Optional["Preprocessor"] = None, + use_gpu: bool = False, + ): + self.model = model + self.model.eval() + self.use_gpu = use_gpu + + if use_gpu: + # TODO (jiaodong): #26249 Use multiple GPU devices with sharded input + self.device = torch.device("cuda") + else: + self.device = torch.device("cpu") + + # Ensure input tensor and model live on the same device + self.model.to(self.device) + + if ( + not use_gpu + and torch.cuda.device_count() > 0 + and log_once("torch_predictor_not_using_gpu") + ): + logger.warning( + "You have `use_gpu` as False but there are " + f"{torch.cuda.device_count()} GPUs detected on host where " + "prediction will only use CPU. Please consider explicitly " + "setting `TorchPredictor(use_gpu=True)` or " + "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " + "enable GPU prediction." + ) + + super().__init__(preprocessor) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r}, use_gpu={self.use_gpu!r})" + ) + + @classmethod + def from_checkpoint( + cls, + checkpoint: TorchCheckpoint, + model: Optional[torch.nn.Module] = None, + use_gpu: bool = False, + ) -> "TorchPredictor": + """Instantiate the predictor from a TorchCheckpoint. + + Args: + checkpoint: The checkpoint to load the model and preprocessor from. + model: If the checkpoint contains a model state dict, and not + the model itself, then the state dict will be loaded to this + ``model``. If the checkpoint already contains the model itself, + this model argument will be discarded. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. + """ + model = checkpoint.get_model(model) + preprocessor = checkpoint.get_preprocessor() + return cls(model=model, preprocessor=preprocessor, use_gpu=use_gpu) + + @DeveloperAPI + def call_model( + self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]] + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + """Runs inference on a single batch of tensor data. + + This method is called by `TorchPredictor.predict` after converting the + original data batch to torch tensors. + + Override this method to add custom logic for processing the model input or + output. + + Args: + inputs: A batch of data to predict on, represented as either a single + PyTorch tensor or for multi-input models, a dictionary of tensors. + + Returns: + The model outputs, either as a single tensor or a dictionary of tensors. + + Example: + + .. testcode:: + + import numpy as np + import torch + from ray.train.torch import TorchPredictor + + # List outputs are not supported by default TorchPredictor. + # So let's define a custom TorchPredictor and override call_model + class MyModel(torch.nn.Module): + def forward(self, input_tensor): + return [input_tensor, input_tensor] + + # Use a custom predictor to format model output as a dict. + class CustomPredictor(TorchPredictor): + def call_model(self, inputs): + model_output = super().call_model(inputs) + return { + str(i): model_output[i] for i in range(len(model_output)) + } + + # create our data batch + data_batch = np.array([1, 2]) + # create custom predictor and predict + predictor = CustomPredictor(model=MyModel()) + predictions = predictor.predict(data_batch) + print(f"Predictions: {predictions.get('0')}, {predictions.get('1')}") + + .. 
testoutput:: + + Predictions: [1 2], [1 2] + + """ + with torch.no_grad(): + output = self.model(inputs) + return output + + def predict( + self, + data: DataBatchType, + dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + ) -> DataBatchType: + """Run inference on a data batch. + + If the provided data is a single array or a dataframe/table with a single + column, it will be converted into a single PyTorch tensor before being + inputted to the model. + + If the provided data is a multi-column table or a dict of numpy arrays, + it will be converted into a dict of tensors before being inputted to the + model. This is useful for multi-modal inputs (for example your model accepts + both image and text). + + Args: + data: A batch of input data of ``DataBatchType``. + dtype: The dtypes to use for the tensors. Either a single dtype for all + tensors or a mapping from column name to dtype. + + Returns: + DataBatchType: Prediction result. The return type will be the same as the + input type. + + Example: + + .. testcode:: + + import numpy as np + import pandas as pd + import torch + import ray + from ray.train.torch import TorchPredictor + + # Define a custom PyTorch module + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(1, 1) + self.linear2 = torch.nn.Linear(1, 1) + + def forward(self, input_dict: dict): + out1 = self.linear1(input_dict["A"].unsqueeze(1)) + out2 = self.linear2(input_dict["B"].unsqueeze(1)) + return out1 + out2 + + # Set manual seed so we get consistent output + torch.manual_seed(42) + + # Use Standard PyTorch model + model = torch.nn.Linear(2, 1) + predictor = TorchPredictor(model=model) + # Define our data + data = np.array([[1, 2], [3, 4]]) + predictions = predictor.predict(data, dtype=torch.float) + print(f"Standard model predictions: {predictions}") + print("---") + + # Use Custom PyTorch model with TorchPredictor + predictor = TorchPredictor(model=CustomModule()) + # Define our data and predict with the custom model using TorchPredictor + data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + predictions = predictor.predict(data, dtype=torch.float) + print(f"Custom model predictions: {predictions}") + + .. testoutput:: + + Standard model predictions: {'predictions': array([[1.5487633], + [3.8037925]], dtype=float32)} + --- + Custom model predictions: predictions + 0 [0.61623406] + 1 [2.857038] + """ + return super(TorchPredictor, self).predict(data=data, dtype=dtype) + + def _arrays_to_tensors( + self, + numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]], + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + return convert_ndarray_batch_to_torch_tensor_batch( + numpy_arrays, + dtypes=dtype, + device=self.device, + ) + + def _tensor_to_array(self, tensor: torch.Tensor) -> np.ndarray: + if not isinstance(tensor, torch.Tensor): + raise ValueError( + "Expected the model to return either a torch.Tensor or a " + f"dict of torch.Tensor, but got {type(tensor)} instead. " + f"To support models with different output types, subclass " + f"TorchPredictor and override the `call_model` method to " + f"process the output into either torch.Tensor or Dict[" + f"str, torch.Tensor]."
+ ) + return tensor.cpu().detach().numpy() diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8df6a6cdbe80016c528bd5a1ac963af86eb9ff64 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py @@ -0,0 +1,204 @@ +from typing import Any, Callable, Dict, Optional, Union + +from ray.train import Checkpoint, DataConfig, RunConfig, ScalingConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.torch.config import TorchConfig +from ray.train.trainer import GenDataset +from ray.util import PublicAPI + + +@PublicAPI(stability="stable") +class TorchTrainer(DataParallelTrainer): + """A Trainer for data parallel PyTorch training. + + At a high level, this Trainer does the following: + + 1. Launches multiple workers as defined by the ``scaling_config``. + 2. Sets up a distributed PyTorch environment + on these workers as defined by the ``torch_config``. + 3. Ingests the input ``datasets`` based on the ``dataset_config``. + 4. Runs the input ``train_loop_per_worker(train_loop_config)`` + on all workers. + + For more details, see: + + * :ref:`PyTorch Guide ` + * :ref:`PyTorch Lightning Guide ` + * :ref:`Hugging Face Transformers Guide ` + + Example: + + .. testcode:: + + import os + import tempfile + + import torch + from torch import nn + from torch.nn.parallel import DistributedDataParallel + + import ray + from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig + from ray.train.torch import TorchTrainer + + # If using GPUs, set this to True. + use_gpu = False + # Number of processes to run training on. + num_workers = 4 + + # Define your network structure. + class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.layer1 = nn.Linear(1, 32) + self.relu = nn.ReLU() + self.layer2 = nn.Linear(32, 1) + + def forward(self, input): + return self.layer2(self.relu(self.layer1(input))) + + # Training loop. + def train_loop_per_worker(config): + + # Read configurations. + lr = config["lr"] + batch_size = config["batch_size"] + num_epochs = config["num_epochs"] + + # Fetch training dataset. + train_dataset_shard = ray.train.get_dataset_shard("train") + + # Instantiate and prepare model for training. + model = NeuralNetwork() + model = ray.train.torch.prepare_model(model) + + # Define loss and optimizer. + loss_fn = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + # Create data loader. + dataloader = train_dataset_shard.iter_torch_batches( + batch_size=batch_size, dtypes=torch.float + ) + + # Train multiple epochs. + for epoch in range(num_epochs): + + # Train epoch. + for batch in dataloader: + output = model(batch["input"]) + loss = loss_fn(output, batch["label"]) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Create checkpoint. + base_model = (model.module + if isinstance(model, DistributedDataParallel) else model) + checkpoint_dir = tempfile.mkdtemp() + torch.save( + {"model_state_dict": base_model.state_dict()}, + os.path.join(checkpoint_dir, "model.pt"), + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + + # Report metrics and checkpoint. + ray.train.report({"loss": loss.item()}, checkpoint=checkpoint) + + + # Define configurations. 
+ train_loop_config = {"num_epochs": 20, "lr": 0.01, "batch_size": 32} + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1)) + + # Define datasets. + train_dataset = ray.data.from_items( + [{"input": [x], "label": [2 * x + 1]} for x in range(2000)] + ) + datasets = {"train": train_dataset} + + # Initialize the Trainer. + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + scaling_config=scaling_config, + run_config=run_config, + datasets=datasets + ) + + # Train the model. + result = trainer.fit() + + # Inspect the results. + final_loss = result.metrics["loss"] + + .. testoutput:: + :hide: + + ... + + Args: + + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. Passing large + datasets via `train_loop_config` is not recommended and may introduce + large overhead and unknown issues with serialization and deserialization. + torch_config: The configuration for setting up the PyTorch Distributed backend. + If set to None, a default configuration will be used in which + GPU training uses NCCL and CPU training uses Gloo. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + torch_config: Optional[TorchConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[DataConfig] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + if not torch_config: + torch_config = TorchConfig() + + super(TorchTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=torch_config, + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py b/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..465eed45a4a894f301ae6d585bfdbeac3839d6e4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py @@ -0,0 +1,774 @@ +import collections +import logging +import os +import random +import types +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from packaging.version import Version +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel +from torch.optim import Optimizer +from torch.utils.data import ( + DataLoader, + DistributedSampler, + IterableDataset, + RandomSampler, + SequentialSampler, +) + +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.air._internal.device_manager import ( + get_torch_device_manager_by_context, + get_torch_device_manager_by_device_type, +) +from ray.train._internal import session +from ray.train._internal.accelerator import Accelerator +from ray.train._internal.session import get_accelerator, set_accelerator +from ray.util.annotations import Deprecated, PublicAPI + +if Version(torch.__version__) < Version("1.11.0"): + FullyShardedDataParallel = None +else: + from torch.distributed.fsdp import FullyShardedDataParallel + +try: + from torch.profiler import profile +except ImportError: + profile = None + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="stable") +def get_device() -> torch.device: + """Gets the correct torch device configured for this process. + + Returns the torch device for the current worker. If more than 1 GPU is + requested per worker, returns the device with the minimal device index. + + .. note:: + + If you requested multiple GPUs per worker, and want to get + the full list of torch devices, please use + :meth:`~ray.train.torch.get_devices`. + + Assumes that `CUDA_VISIBLE_DEVICES` is set and is a + superset of the `ray.get_gpu_ids()`. + + Examples: + + Example: Launched 2 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:0") + + Example: Launched 4 workers on the current node, each with 1 GPU + + .. 
testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:2") + + Example: Launched 2 workers on the current node, each with 2 GPUs + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + ray.get_gpu_ids() == [2,3] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:2") + + + You can move a model to device by: + + .. testcode:: + :skipif: True + + model.to(ray.train.torch.get_device()) + + Instead of manually checking the device type: + + .. testcode:: + :skipif: True + + model.to("cuda" if torch.cuda.is_available() else "cpu") + """ + from ray.air._internal import torch_utils + + record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICE, "1") + return torch_utils.get_devices()[0] + + +@PublicAPI(stability="beta") +def get_devices() -> List[torch.device]: + """Gets the correct torch device list configured for this process. + + Assumes that `CUDA_VISIBLE_DEVICES` is set and is a + superset of the `ray.get_gpu_ids()`. + + + Examples: + + Example: Launched 2 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:0")] + + Example: Launched 4 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "0,1,2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:2")] + + Example: Launched 2 workers on the current node, each with 2 GPUs + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "0,1,2,3" + ray.get_gpu_ids() == [2,3] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:2"), torch.device("cuda:3")] + """ + + from ray.air._internal import torch_utils + + record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICES, "1") + return torch_utils.get_devices() + + +@PublicAPI(stability="stable") +def prepare_model( + model: torch.nn.Module, + move_to_device: Union[bool, torch.device] = True, + parallel_strategy: Optional[str] = "ddp", + parallel_strategy_kwargs: Optional[Dict[str, Any]] = None, +) -> torch.nn.Module: + """Prepares the model for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + model (torch.nn.Module): A torch model to prepare. + move_to_device: Either a boolean indiciating whether to move + the model to the correct device or an actual device to + move the model to. If set to False, the model needs + to manually be moved to the correct device. + parallel_strategy ("ddp", "fsdp", or None): Whether to wrap models + in ``DistributedDataParallel``, ``FullyShardedDataParallel``, + or neither. + parallel_strategy_kwargs (Dict[str, Any]): Args to pass into + ``DistributedDataParallel`` or ``FullyShardedDataParallel`` + initialization if ``parallel_strategy`` is set to "ddp" + or "fsdp", respectively. + """ + + if parallel_strategy == "fsdp" and FullyShardedDataParallel is None: + raise ImportError( + "FullyShardedDataParallel requires torch>=1.11.0. " + "Run `pip install 'torch>=1.11.0'` to use FullyShardedDataParallel." 
+ ) + + record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_MODEL, "1") + return get_accelerator(_TorchAccelerator).prepare_model( + model, + move_to_device=move_to_device, + parallel_strategy=parallel_strategy, + parallel_strategy_kwargs=parallel_strategy_kwargs, + ) + + +@PublicAPI(stability="stable") +def prepare_data_loader( + data_loader: torch.utils.data.DataLoader, + add_dist_sampler: bool = True, + move_to_device: bool = True, + auto_transfer: bool = True, +) -> torch.utils.data.DataLoader: + """Prepares :class:`~torch.utils.data.DataLoader` for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + .. note:: + + This method adds a `DistributedSampler` to the `DataLoader` if the + number of training workers is greater than 1. If shuffling is + enabled on the original `DataLoader`, then `shuffle=True` will also + be passed into the `DistributedSampler` constructor. `shuffle=False` + on the original `DataLoader` also means that shuffling is disabled + on the sampler. + + With more than 1 worker, calling the `DistributedSampler.set_epoch` method + at the beginning of each epoch before creating the DataLoader iterator + is necessary to make shuffling work properly across multiple epochs. + Otherwise, the same ordering will always be used. + See: https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler # noqa: E501 + + Example: + + .. testcode:: + :skipif: True + + import torch + + import ray.train.torch + + train_dataloader = torch.utils.data.DataLoader( + ..., batch_size=..., shuffle=True + ) + train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader) + + for epoch in range(10): + if ray.train.get_context().get_world_size() > 1: + # Required for the distributed sampler to shuffle properly across epochs + train_dataloader.sampler.set_epoch(epoch) + + for X, y in train_dataloader: + # No need to move data to GPU, this is done by `prepare_data_loader`! + # X, y = X.to("cuda"), y.to("cuda") + ... + + Args: + data_loader (torch.utils.data.DataLoader): The DataLoader to + prepare. + add_dist_sampler: Whether to add a DistributedSampler to + the provided DataLoader. + move_to_device: If set, automatically move the data + returned by the data loader to the correct device. + auto_transfer: If set and device is GPU, another CUDA stream + is created to automatically copy data from host (CPU) memory + to device (GPU) memory (the default CUDA stream still runs the + training procedure). If device is CPU, it will be disabled + regardless of the setting. This configuration will be ignored + if ``move_to_device`` is False. + """ + record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_DATALOADER, "1") + return get_accelerator(_TorchAccelerator).prepare_data_loader( + data_loader, + add_dist_sampler=add_dist_sampler, + move_to_device=move_to_device, + auto_transfer=auto_transfer, + ) + + +@PublicAPI(stability="beta") +def accelerate(amp: bool = False) -> None: + """Enables training optimizations. + + Arguments: + amp: If true, perform training with automatic mixed precision. + Otherwise, use full precision. + + .. warning:: ``train.torch.accelerate`` cannot be called more than once, and it + must be called before any other ``train.torch`` utility function. + """ + try: + set_accelerator(_TorchAccelerator(amp=amp)) + except RuntimeError: + raise RuntimeError( + "An accelerator has already been set.
Make sure " + "`train.torch.accelerate()` is not called multiple times, and is called " + "before any of the prepare methods." + ) + + +@PublicAPI(stability="beta") +def prepare_optimizer(optimizer: torch.optim.Optimizer) -> torch.optim.Optimizer: + """Wraps optimizer to support automatic mixed precision. + + Args: + optimizer (torch.optim.Optimizer): The optimizer to prepare. + + Returns: + A wrapped optimizer. + """ + return get_accelerator(_TorchAccelerator).prepare_optimizer(optimizer) + + +@PublicAPI(stability="beta") +def backward(tensor: torch.Tensor) -> None: + """Computes the gradient of the specified tensor w.r.t. graph leaves. + + Args: + tensor (torch.Tensor): Tensor of which the derivative will be computed. + """ + get_accelerator(_TorchAccelerator).backward(tensor) + + +@PublicAPI(stability="stable") +def enable_reproducibility(seed: int = 0) -> None: + """Limits sources of nondeterministic behavior. + + This function: + + * Seeds PyTorch, Python, and NumPy. + * Disables CUDA convolution benchmarking. + * Configures PyTorch to use deterministic algorithms. + * Seeds workers spawned for multi-process data loading. + + Args: + seed: The number to seed libraries and data workers with. + + .. warning:: ``train.torch.enable_reproducibility()`` can't guarantee + completely reproducible results across executions. To learn more, read + the `PyTorch notes on randomness + `_. + """ + get_accelerator(_TorchAccelerator).enable_reproducibility(seed) + + +@Deprecated +class TorchWorkerProfiler: + """Utility class for running PyTorch Profiler on a Train worker. + + Args: + trace_dir (Optional[str]): The directory to store traces on the + worker node. If ``None``, this will use a default temporary dir. + """ + + WORKER_TRACE_DIR_NAME = "pytorch_profiler_worker_traces" + + def __init__(self, trace_dir: Optional[str] = None): + raise DeprecationWarning( + "The `ray.train.torch.TorchWorkerProfiler` API is deprecated in Ray 2.0.", + ) + + +class _TorchAccelerator(Accelerator): + """A utility that implements methods to accelerate PyTorch training. + + Arguments: + amp: If true, perform training with automatic mixed precision. + Otherwise, use full precision. + """ + + def __init__(self, amp: bool = False): + self.amp_is_enabled = amp + self.scaler = GradScaler() if amp else None + self._seed = None + self.device_manager = get_torch_device_manager_by_context() + + def prepare_model( + self, + model: torch.nn.Module, + move_to_device: bool = True, + parallel_strategy: Optional[str] = "ddp", + parallel_strategy_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.nn.Module: + """Prepares the model for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + model (torch.nn.Module): A torch model to prepare. + move_to_device: Whether to move the model to the correct + device. If set to False, the model needs to manually be moved + to the correct device. + parallel_strategy ("ddp", "fsdp", or None): Whether to wrap models + in ``DistributedDataParallel``, ``FullyShardedDataParallel`` ( + Experimental), or neither. + parallel_strategy_kwargs (Dict[str, Any]): Args to pass into + ``DistributedDataParallel`` or ``FullyShardedDataParallel`` + initialization if ``parallel_strategy`` is set to "ddp" + or "fsdp", respectively.
+ """ + parallel_strategy_kwargs = parallel_strategy_kwargs or {} + + rank = session.get_local_rank() + + if isinstance(move_to_device, torch.device): + device = move_to_device + else: + device = get_device() + if isinstance(device, list): + device = device[0] + + if self.device_manager.is_available(): + self.device_manager.set_device(device) + + if move_to_device: + if rank == 0: + logger.info(f"Moving model to device: {device}") + else: + logger.debug(f"Moving model to device: {device}") + model = model.to(device) + + def model_get_state(self): + # `__getstate__` is an special method that informs pickle which attributes + # to serialize. This custom implementation ensures that the wrapped forward + # method and custom `__getstate__` method aren't serialized. + if hasattr(self, "_original_get_state"): + state = self._original_get_state() + state["__getstate__"] = state["_original_get_state"] + del state["_original_get_state"] + else: + # If model does not have a `__getstate__` already defined, use default + # implementation. + state = self.__dict__.copy() + del state["__getstate__"] + state["forward"] = state["_unwrapped_forward"] + del state["_unwrapped_forward"] + + return state + + if self.amp_is_enabled: + # Pickle cannot serialize the wrapped forward method. As a workaround, + # define a custom `__getstate__` method that unwraps the forward method. + model._unwrapped_forward = model.forward + model.forward = autocast()(model.forward) + + # TODO(amogkam): Replace below logic with a generic "unpack model" method. + # Replacing the `model.forward` method makes the model no longer + # serializable. When serializing the model, we have to override the + # `__getstate__` method to set back the original forward method. + if hasattr(model, "__getstate__"): + model._original_get_state = model.__getstate__ + # `__getstate__` must be a bound method rather than an callable attribute. + # See https://stackoverflow.com/questions/972/adding-a-method-to-an-existing-object-instance. # noqa: E501 + model.__getstate__ = types.MethodType(model_get_state, model) + + world_size = session.get_world_size() + + if parallel_strategy and world_size > 1: + if parallel_strategy == "ddp": + DataParallel = DistributedDataParallel + if self.device_manager.is_available() and device.type != "cpu": + parallel_strategy_kwargs = { + "device_ids": [device], + "output_device": device, + **parallel_strategy_kwargs, + } + else: + if not torch.cuda.is_available(): + raise RuntimeError( + "FSDP is only available with GPU-enabled " + "training. Set " + "`use_gpu=True` in your Trainer to train with " + "GPUs." + ) + DataParallel = FullyShardedDataParallel + if rank == 0: + logger.info(f"Wrapping provided model in {DataParallel.__name__}.") + else: + logger.debug(f"Wrapping provided model in {DataParallel.__name__}.") + model = DataParallel(model, **parallel_strategy_kwargs) + + return model + + def prepare_data_loader( + self, + data_loader: torch.utils.data.DataLoader, + add_dist_sampler: bool = True, + move_to_device: bool = True, + auto_transfer: bool = False, + ) -> torch.utils.data.DataLoader: + """Prepares DataLoader for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + data_loader (torch.utils.data.DataLoader): The DataLoader to + prepare. + add_dist_sampler: Whether to add a DistributedSampler to + the provided DataLoader. 
+ move_to_device: If set, automatically move the data + returned by the data loader to the correct device. + auto_transfer: (Experimental) If set and device is GPU, another CUDA stream + is created to automatically copy data from host (CPU) memory + to device (GPU) memory (the default CUDA stream still runs the + training procedure). If device is CPU, it will be disabled + regardless of the setting. This configuration will be ignored + if ``move_to_device`` is False. + """ + + world_size = session.get_world_size() + world_rank = session.get_world_rank() + + # Only add Distributed Sampler if the following conditions hold: + # 1. More than one training worker is being used. + # 2. A DistributedSampler has not already been added by the user. + # 3. The dataset is not an IterableDataset. Samplers do not work with + # IterableDatasets. + if ( + world_size > 1 + and not isinstance(data_loader.sampler, DistributedSampler) + and not ( + hasattr(data_loader, "dataset") + and isinstance(data_loader.dataset, IterableDataset) + ) + and add_dist_sampler + ): + + def with_sampler(loader): + # Automatically set the DistributedSampler + + # If you're using a sampler, the DataLoader shuffle flag must be set to + # False. Shuffling is instead determined by the shuffle argument passed + # to the DistributedSampler constructor. + + # If no sampler is passed to the DataLoader constructor, Torch + # constructs a default sampler. The default sampler is a RandomSampler + # if shuffling is enabled and a SequentialSampler otherwise. DataLoader + # does not have a shuffle attribute, so we instead identify whether + # shuffling is enabled by checking the default sampler type. + shuffle = not isinstance(loader.sampler, SequentialSampler) + + def seeded_worker_init_fn( + worker_init_fn: Optional[Callable[[int], None]] + ): + def wrapper(worker_id: int): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + if worker_init_fn: + worker_init_fn(worker_id) + + return wrapper + + worker_init_fn: Optional[Callable[[int], None]] = loader.worker_init_fn + generator: Optional[torch.Generator] = loader.generator + if self._seed is not None: + worker_init_fn = seeded_worker_init_fn(worker_init_fn) + generator = torch.Generator() + generator.manual_seed(self._seed) + + using_default_sampler = isinstance( + loader.sampler, (SequentialSampler, RandomSampler) + ) + if not using_default_sampler and world_rank == 0: + logger.warn( + f"The {loader.sampler.__class__.__name__} will be overwritten " + "with a DistributedSampler. You can disable this by setting " + "`add_dist_sampler` to False in `prepare_data_loader`." + ) + + data_loader_args = { + "dataset": loader.dataset, + "batch_size": loader.batch_size, + "shuffle": False, + "num_workers": loader.num_workers, + "collate_fn": loader.collate_fn, + "pin_memory": loader.pin_memory, + "drop_last": loader.drop_last, + "timeout": loader.timeout, + "worker_init_fn": worker_init_fn, + "generator": generator, + "sampler": DistributedSampler(loader.dataset, shuffle=shuffle), + } + return DataLoader(**data_loader_args) + + data_loader = with_sampler(data_loader) + + if move_to_device: + device = get_device() + data_loader = _WrappedDataLoader(data_loader, device, auto_transfer) + + return data_loader + + def prepare_optimizer(self, optimizer: Optimizer) -> Optimizer: + """Wraps optimizer to support automatic mixed precision. + + Args: + optimizer (torch.optim.Optimizer): The optimizer to prepare. + + Returns: + A wrapped optimizer. 
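# A standalone sketch of what `with_sampler` above effectively does: infer
# `shuffle` from the default sampler type and rebuild the DataLoader around a
# DistributedSampler. Explicit num_replicas/rank are passed so this runs
# without a process group; the code above instead relies on the process group
# that Ray Train initializes on each worker.
import torch
from torch.utils.data import (
    DataLoader,
    DistributedSampler,
    SequentialSampler,
    TensorDataset,
)

dataset = TensorDataset(torch.arange(32, dtype=torch.float32).unsqueeze(1))
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# A default RandomSampler means the user asked for shuffling.
shuffle = not isinstance(loader.sampler, SequentialSampler)
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=shuffle)
dist_loader = DataLoader(dataset, batch_size=loader.batch_size, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)  # re-seeds the per-epoch shuffle
    for (batch,) in dist_loader:
        pass  # this rank only sees its 16-sample shard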
+ """ + return _WrappedOptimizer(optimizer, scaler=self.scaler) + + def backward(self, tensor: torch.Tensor) -> None: + """Computes the gradient of the specified tensor w.r.t. graph leaves. + + Args: + tensor (torch.Tensor): Tensor of which the derivative will be computed. + """ + if self.amp_is_enabled: + self.scaler.scale(tensor).backward() + else: + tensor.backward() + + def enable_reproducibility(self, seed: int = 0) -> None: + """Limits sources of nondeterministic behavior.""" + self._seed = seed + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + torch.use_deterministic_algorithms(True) + torch.backends.cudnn.benchmark = False + + # If you want to use deterministic algorithms with CUDA, then you need to set + # the CUBLAS_WORKSPACE_CONFIG environment variable; otherwise, Torch errors. + # See https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility. + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + + +class _WrappedDataLoader(DataLoader): + def __init__( + self, base_dataloader: DataLoader, device: torch.device, auto_transfer: bool + ): + self.__dict__.update(getattr(base_dataloader, "__dict__", {})) + self._dataloader = base_dataloader + self.dataloader_iter = None + self.device = device + + self.device_manager = get_torch_device_manager_by_device_type(device.type) + + # disable auto transfer (host->device) if cpu is used + if device.type != "cpu" and self.device_manager.supports_stream(): + self._auto_transfer = auto_transfer + else: + self._auto_transfer = False + # create a new device stream to move data from host to device concurrently + self._memcpy_stream = ( + self.device_manager.create_stream(device) + if device.type != "cpu" and self._auto_transfer + else None + ) + self.next_batch = None + + def _move_to_device(self, item): + if item is None: + return None + + def try_move_device(i): + try: + i = i.to(self.device, non_blocking=self._auto_transfer) + except AttributeError: + logger.debug(f"Item {i} cannot be moved to device " f"{self.device}.") + return i + + with self.device_manager.get_stream_context(self._memcpy_stream): + if isinstance(item, collections.abc.Mapping): + item_on_device = {k: self._move_to_device(v) for k, v in item.items()} + elif isinstance(item, tuple): + item_on_device = tuple(self._move_to_device(i) for i in item) + elif isinstance(item, list): + item_on_device = [self._move_to_device(i) for i in item] + elif isinstance(item, torch.Tensor): + item_on_device = try_move_device(item) + else: + logger.debug( + f"Data type {type(item)} doesn't support being moved to device." + ) + item_on_device = item + + return item_on_device + + def _wait_for_batch(self, item): + if self._memcpy_stream is None: + return + # Reference: + # https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html + # The training stream (current) needs to wait until + # the memory copy stream finishes. + curr_stream = self.device_manager.get_current_stream() + curr_stream.wait_stream(self._memcpy_stream) + # When a tensor is used by CUDA streams different from + # its original allocator, we need to call ``record_stream`` + # to inform the allocator of all these streams. Otherwise, + # the tensor might be freed once it is no longer used by + # the creator stream. + for i in item: + # The Pytorch DataLoader has no restrictions on what is outputted for + # each batch. We should only ``record_stream`` if the item has the + # ability to do so. 
+ try: + i.record_stream(curr_stream) + except AttributeError: + pass + + def __len__(self): + return len(self._dataloader) + + def _prefetch_next_batch(self): + next_batch = next(self.dataloader_iter, None) + self.next_batch = self._move_to_device(next_batch) + + def __iter__(self): + self.dataloader_iter = iter(self._dataloader) + self._prefetch_next_batch() + return self + + def __next__(self): + next_batch = self.next_batch + if next_batch is None: + raise StopIteration + self._wait_for_batch(next_batch) + self._prefetch_next_batch() + return next_batch + + +class _WrappedOptimizer(Optimizer): + def __init__(self, optimizer: Optimizer, scaler: Optional[GradScaler] = None): + self.optimizer = optimizer + self.scaler = scaler + + @property + def state(self): + return self.optimizer.state + + @state.setter + def state(self, state): + self.optimizer.state = state + + @property + def param_groups(self): + return self.optimizer.param_groups + + @param_groups.setter + def param_groups(self, param_groups): + self.optimizer.param_groups = param_groups + + @property + def defaults(self): + return self.optimizer.defaults + + @defaults.setter + def defaults(self, defaults): + self.optimizer.defaults = defaults + + def add_param_group(self, param_group): + self.optimizer.add_param_group(param_group) + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) + + def state_dict(self): + return self.optimizer.state_dict() + + def zero_grad(self): + self.optimizer.zero_grad() + + def step(self, closure=None): + if self.scaler is not None: + self.scaler.step(self.optimizer, closure) + self.scaler.update() + else: + self.optimizer.step(closure) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea32abc8c9d7b5514b3c5680e06ee0ab334eb4eb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py @@ -0,0 +1,5 @@ +from ray.train.torch.xla.config import TorchXLAConfig + +__all__ = [ + "TorchXLAConfig", +] diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456f68323a5adb487aab3f89cbc4a91158e08eff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b814a756106585ff5ddc2d962789f33338eeb3e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e965f9fc269acbc19dded2001051bf350ddf80f1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py @@ -0,0 +1,169 @@ +import logging +import os +import re +import shutil +import uuid +from dataclasses import dataclass + +import ray +from ray.train._internal.utils import get_address_and_port +from 
ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend +from ray.train.torch import TorchConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +@dataclass +class TorchXLAConfig(TorchConfig): + """ + Configuration for torch XLA setup. + See https://pytorch.org/xla/release/1.13/index.html for more info. + Currently, only "neuron_cores" accelerator (AwsNeuronXLABackend) + is supported with xrt runtime. + """ + + neuron_parallel_compile: bool = False + + @property + def backend_cls(self): + return _TorchAwsNeuronXLABackend + + +def _kill_xrt_server(): + import subprocess + + subprocess.call(["pkill", "-f", "xrt_run_server"]) + + +def _set_xla_env_vars(): + # https://pytorch.org/docs/1.13/elastic/run.html#environment-variables + context = ray.train.get_context() + + os.environ["LOCAL_RANK"] = str(context.get_local_rank()) + os.environ["RANK"] = str(context.get_world_rank()) + os.environ["LOCAL_WORLD_SIZE"] = str(context.get_local_world_size()) + os.environ["WORLD_SIZE"] = str(context.get_world_size()) + os.environ["GROUP_RANK"] = str(context.get_node_rank()) + os.environ["GROUP_WORLD_SIZE"] = str( + context.get_world_size() / context.get_local_world_size() + ) + os.environ["ROLE_RANK"] = str(context.get_world_rank()) + os.environ["ROLE_WORLD_RANK"] = str(context.get_world_rank()) + os.environ["ROLE_WORLD_SIZE"] = str(context.get_world_size()) + + # EFA and XLA setup + # https://github.com/aws/libfabric/blob/master/prov/efa/src/rxr/rxr_init.c + # https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128.sh # noqa + os.environ["FI_PROVIDER"] = "efa" + os.environ["FI_EFA_USE_DEVICE_RDMA"] = "1" + os.environ["FI_EFA_FORK_SAFE"] = "1" + os.environ["XLA_TRANSFER_SEED_ASYNC"] = "1" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + + +def _setup_xla_torch_process_group(): + try: + import torch.distributed as dist + import torch_xla.core.xla_model as xm # noqa F401 + import torch_xla.distributed.xla_backend # noqa F401 + + dist.init_process_group("xla") + except ImportError: + raise ImportError("torch_xla must be installed to use torch_xla backend.") + + +# The following env vars enable Neuron graph extraction for parallel compilation +# Note: model outputs are invalid and should be ignored while these env vars are set +def _set_neuron_parallel_compile_env_vars(): + os.environ["NEURON_PARALLEL_COMPILE"] = "1" + os.environ["NEURON_EXTRACT_GRAPHS_ONLY"] = "1" + os.environ["NEURON_FALL_BACK_TO_NULL_NEFF"] = "1" + + +# Compile previously extracted Neuron graphs +def _neuron_compile_extracted_graphs(): + try: + from libneuronxla.neuron_cc_cache import CacheUrl + from libneuronxla.neuron_parallel_compile import parallel_compile + except ImportError: + raise ImportError( + "libneuronxla must be installed to use Neuron parallel compilation." + ) + + # Only 1 worker per node should run parallel_compile() + if os.environ.get("LOCAL_RANK") == "0": + logger.info("Compiling extracted graphs on local rank0 worker") + + parallel_compile_workdir = ( + f"/tmp/{os.environ.get('USER','no-user')}/parallel_compile_workdir/" + ) + if os.path.exists(parallel_compile_workdir): + shutil.rmtree(parallel_compile_workdir) + os.makedirs(parallel_compile_workdir, exist_ok=True) + + # Users can set the cache directory using --cache_dir in NEURON_CC_FLAGS or by + # using NEURON_COMPILE_CACHE_URL. --cache_dir takes precedence. 
+ explicit_cache_dir = None + if neuron_cc_flags := os.environ.get("NEURON_CC_FLAGS"): + if s := re.search(r"--cache_dir[= ](\S+)", neuron_cc_flags): + explicit_cache_dir = s.group(1) + + parallel_compile( + parallel_compile_workdir, + CacheUrl.get_cache_url(explicit_cache_dir), + ) + + +class _TorchAwsNeuronXLABackend(Backend): + unique_run_id: str = str(uuid.uuid4()) + + def on_start(self, worker_group: WorkerGroup, backend_config: TorchXLAConfig): + """Logic ran right before training is started.""" + + # On previous worker failure, we don't run graceful shutdown on workers. + # This would leak any running xrt server. + worker_group.execute(_kill_xrt_server) + + # Get master address and port from the first worker. + master_addr, master_port = worker_group.execute_single(0, get_address_and_port) + + def set_env_vars(addr, port): + os.environ["MASTER_ADDR"] = addr + os.environ["MASTER_PORT"] = str(port) + # To trigger the xrt server + os.environ["TORCHELASTIC_RUN_ID"] = self.unique_run_id + + # Set the env vars on all workers. + worker_group.execute(set_env_vars, addr=master_addr, port=master_port) + + # Set up env vars for neuron parallel compilation graph extraction + if backend_config.neuron_parallel_compile: + logger.info("Extracting graphs for Neuron parallel compilation") + worker_group.execute(_set_neuron_parallel_compile_env_vars) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: TorchXLAConfig + ): + """ + Configure the environment variables for the worker group. + And initialize the xla distributed process group. + TODO: Current setup only supports homogenous cluster with + neuron_cores accelerator and xrt runtime. + """ + worker_group.execute(_set_xla_env_vars) + worker_group.execute(_setup_xla_torch_process_group) + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchXLAConfig): + """ + Logic ran right after training is finished. + This is a sanity cleanup to kill xrt server, and to optionally + run neuron parallel graph compilation + """ + worker_group.execute(_kill_xrt_server) + + # Compile the extracted graphs. This must run at end of training. 
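# A minimal sketch of the Backend hook sequence that `_TorchAwsNeuronXLABackend`
# uses above (on_start -> on_training_start -> on_shutdown). The config class,
# backend class, and environment variable here are illustrative only.
import os
from dataclasses import dataclass

from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import Backend, BackendConfig


def _mark_worker_ready():
    os.environ["EXAMPLE_BACKEND_READY"] = "1"  # hypothetical marker


class _ExampleBackend(Backend):
    def on_start(self, worker_group: WorkerGroup, backend_config: "ExampleConfig"):
        # Runs before training starts, e.g. to set env vars on every worker.
        worker_group.execute(_mark_worker_ready)

    def on_training_start(
        self, worker_group: WorkerGroup, backend_config: "ExampleConfig"
    ):
        # Runs once workers are up, e.g. to initialize a process group.
        pass

    def on_shutdown(self, worker_group: WorkerGroup, backend_config: "ExampleConfig"):
        # Cleanup after training finishes.
        pass


@dataclass
class ExampleConfig(BackendConfig):
    @property
    def backend_cls(self):
        return _ExampleBackend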
+ if backend_config.neuron_parallel_compile: + worker_group.execute(_neuron_compile_extracted_graphs) diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b5a3988bd457bf7349c9981c51e1dc604809c155 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py @@ -0,0 +1,154 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.lightgbm.config import LightGBMConfig, get_network_params # noqa +from ray.train.trainer import GenDataset +from ray.train.v2._internal.constants import _UNSUPPORTED +from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer + +logger = logging.getLogger(__name__) + + +class LightGBMTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel LightGBM training. + + Example + ------- + + .. testcode:: + + import lightgbm as lgb + + import ray.data + import ray.train + from ray.train.lightgbm import RayTrainReportCallback + from ray.train.lightgbm.v2 import LightGBMTrainer + + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. Get the dataset shard for the worker and convert to a `lgb.Dataset` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + train_set = lgb.Dataset(train_X, label=train_y) + eval_set = lgb.Dataset(eval_X, label=eval_y) + + # 2. Run distributed data-parallel training. + # `get_network_params` sets up the necessary configurations for LightGBM + # to set up the data parallel training worker group on your Ray cluster. + params = { + "objective": "regression", + # Adding the line below is the only change needed + # for your `lgb.train` call! + **ray.train.lightgbm.v2.get_network_params(), + } + lgb.train( + params, + train_set, + valid_sets=[eval_set], + valid_names=["eval"], + # To access the checkpoint from trainer, you need this callback. + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32, 32 + 16)] + ) + trainer = LightGBMTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. 
testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. + lightgbm_config: The configuration for setting up the distributed lightgbm + backend. See :class:`~ray.train.lightgbm.LightGBMConfig` for more info. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + lightgbm_config: Optional[LightGBMConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = _UNSUPPORTED, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(LightGBMTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=lightgbm_config or LightGBMConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ): + """Retrieve the LightGBM model stored in this checkpoint. + + This API is deprecated. Use `RayTrainReportCallback.get_model` instead. + """ + raise DeprecationWarning( + "`LightGBMTrainer.get_model` is deprecated. " + "Use `RayTrainReportCallback.get_model` instead." 
+ ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..44efc6b2dfb7abd1eea105dad6e2e07b0e8d094a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py @@ -0,0 +1,154 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.trainer import GenDataset +from ray.train.v2._internal.constants import _UNSUPPORTED +from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer +from ray.train.xgboost import XGBoostConfig + +logger = logging.getLogger(__name__) + + +class XGBoostTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel XGBoost training. + + Example + ------- + + .. testcode:: + + import xgboost + + import ray.data + import ray.train + from ray.train.xgboost import RayTrainReportCallback + from ray.train.xgboost import XGBoostTrainer + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + dtrain = xgboost.DMatrix(train_X, label=train_y) + deval = xgboost.DMatrix(eval_X, label=eval_y) + + params = { + "tree_method": "approx", + "objective": "reg:squarederror", + "eta": 1e-4, + "subsample": 0.5, + "max_depth": 2, + } + + # 2. Do distributed data-parallel training. + # Ray Train sets up the necessary coordinator processes and + # environment variables for your workers to communicate with each other. + bst = xgboost.train( + params, + dtrain=dtrain, + evals=[(deval, "validation")], + num_boost_round=10, + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)]) + trainer = XGBoostTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. 
+ xgboost_config: The configuration for setting up the distributed xgboost + backend. Defaults to using the "rabit" backend. + See :class:`~ray.train.xgboost.XGBoostConfig` for more info. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + xgboost_config: Optional[XGBoostConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = _UNSUPPORTED, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(XGBoostTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=xgboost_config or XGBoostConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ): + """Retrieve the XGBoost model stored in this checkpoint. + + This API is deprecated. Use `RayTrainReportCallback.get_model` instead. + """ + raise DeprecationWarning( + "`XGBoostTrainer.get_model` is deprecated. " + "Use `RayTrainReportCallback.get_model` instead." + )