diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd360bbbd9014c6a3725a7bc085bd9d870dfa544 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8239f824d784f69fda7d5a332750598d90befe31 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1463a8c55eda528286615c67325fa82dd8f0aecd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/base_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b35644e15d8d874cc0393d4bc42902ea9606c90 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/constants.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29f1d86f8f67ed628149614cffb9e7522a6eb3dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/context.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b32970b7e438613b2ad8c983c8c14f8b18e2470 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/data_parallel_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..170258dbcfc06e7e14b1825723e14c08e1faf95e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/error.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9786f430329549d9fd807a84b3197f1d6f520b3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..75e9c11e701f09bcec7369446c722b0fefb48159 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/session.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c542edf29626491786a1b2e29e59d9bb85d56746 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8306b67685612dc1585cc592646ac9eed2a70343 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c9d7ff7993c9e50477ba2b62c1b40b8fb717bff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/accelerator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f591ad7a5479bb1c814035f69cc927a14ed25b2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/backend_executor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7c622f384ade5cfd4be3fde19dcf593463c21e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/checkpoint_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eedc278c3115280a695c0aaac366b71970b1a47 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/data_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3fe351c87c8c09802a3e8bc0aa268ee9c06a2b0 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/dl_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf6f8d984ad927871db737a4dfe5b559ccf3322a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/framework_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70c0add20ebf5e38c2553ffc53a3cf6313f9b1fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/session.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..792797bbc8108698e9e49270a502c2118f279e98 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/storage.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ee8257dd94b3dc28faef502583476594a64e580 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/syncer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cd18f1a44f5ed77bcf7e14043100077aa00c72b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b8290e4aba5ff4f50d02b618e5c4447e97670cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/__pycache__/worker_group.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..1c31098d86defc704ddef75172543813361119da --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/accelerator.py @@ -0,0 +1,5 @@ +import abc + + +class Accelerator(abc.ABC): + """A utility that contains methods to accelerate training.""" diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..3815f31add4097ce2e64f1c7e03f6385bb38ee7a --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/train/_internal/backend_executor.py @@ -0,0 +1,830 @@ +import logging +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar + +import ray +import ray._private.ray_constants as ray_constants +from ray._private.ray_constants import env_integer +from ray.data import Dataset +from ray.exceptions import RayActorError +from ray.train import Checkpoint, DataConfig +from ray.train._internal.session import ( + TrialInfo, + _TrainingResult, + get_session, + init_session, + shutdown_session, +) +from ray.train._internal.storage import StorageContext +from ray.train._internal.utils import check_for_failure +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import BackendConfig +from ray.train.constants import ( + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + RAY_TRAIN_ENABLE_STATE_TRACKING, + TRAIN_ENABLE_WORKER_SPREAD_ENV, + TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, +) +from ray.util.placement_group import get_current_placement_group, remove_placement_group + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +class TrainBackendError(Exception): + """Errors with BackendExecutor that should not be exposed to user.""" + + +class TrainingWorkerError(Exception): + """Raised if a worker fails during training.""" + + +@dataclass +class ResourceConfig: + """ + Resource configuration for resource_ids to share between workers. + + Args: + resource_name: The name of the resource to configure + (Example: "neuron_cores" or "gpu"). + resource_enable_sharing_env_var: The environment variable to + check if the resource should be shared. + share_resource_ids_env_var: The environment variable to configure for + sharing the resources with other workers. + """ + + resource_name: str + resource_enable_sharing_env_var: str + share_resource_ids_env_var: str + + +class BackendExecutor: + """Main execution class for training backends. + + This class holds a worker group and is responsible for executing the + training function on the workers, and collecting intermediate results + from ``session.report()``. + + Args: + backend_config: The configurations for this + specific backend. + num_workers: Number of workers to use for training. + resources_per_worker (Optional[Dict[str, float]]): + Dictionary specifying the resources that will be + requested for each worker. Defaults to {"CPU": 1}. + max_retries: Number of retries when Ray actors fail. + Defaults to 3. Set to -1 for unlimited retries. + """ + + def __init__( + self, + backend_config: BackendConfig, + # TODO(xwjiang): Legacy Ray Train trainer clean up! 
+ trial_info: Optional[TrialInfo] = None, + num_workers: int = 1, + resources_per_worker: Optional[Dict[str, float]] = None, + max_retries: int = 3, + ): + if resources_per_worker is None: + self._resources_per_worker = {"CPU": 1} + else: + self._resources_per_worker = resources_per_worker.copy() + + self._backend_config = backend_config + self._backend = backend_config.backend_cls() + self._num_workers = num_workers + self._max_failures = max_retries + if self._max_failures < 0: + self._max_failures = float("inf") + self._num_failures = 0 + self._last_failure = None + self._initialization_hook = None + self._placement_group = None + + self._trial_info = trial_info + + self.worker_group = InactiveWorkerGroup() + self.dataset_shards = None + + self._resource_configs = [ + ResourceConfig( + ray_constants.NEURON_CORES, + ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR, + ), + ResourceConfig( + ray_constants.NPU, + ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, + ), + # For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var. + ResourceConfig( + ray_constants.GPU, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR, + ), + ] + + # Record the initialization time of BackendExecutor, which is + # after trainer.fit() and before worker_group executes the training function. + self._start_time_ms = int(time.time() * 1000) + + self.state_tracking_enabled = env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0) + + def start( + self, + initialization_hook: Optional[Callable[[], None]] = None, + train_cls: Optional[Type] = None, + train_cls_args: Optional[Tuple] = None, + train_cls_kwargs: Optional[Dict] = None, + ): + """Starts the worker group.""" + self._create_placement_group() + placement_group = self._placement_group or "default" + self.worker_group = WorkerGroup( + num_workers=self._num_workers, + resources_per_worker=self._resources_per_worker, + actor_cls=train_cls, + actor_cls_args=train_cls_args, + actor_cls_kwargs=train_cls_kwargs, + placement_group=placement_group, + ) + # Hack to avoid OOMs. + # This is just a temporary solution for Train loading entire checkpoints + # into memory by ensuring that the rank 0 worker is on the same node as + # trainable, thus allowing for lazy checkpoint transfer to be used. + # See https://github.com/ray-project/ray/issues/33073 + # for more context. + # TODO remove passing in trial_driver_ip. + + trial_driver_node_id = ( + self._trial_info.driver_node_id if self._trial_info else None + ) + self.worker_group.sort_workers_by_node_id_and_gpu_id(trial_driver_node_id) + + try: + if initialization_hook: + self._initialization_hook = initialization_hook + self.worker_group.execute(initialization_hook) + + # Always propagate the driver's DataContext to each worker in the group. 
+ from ray.data import DataContext + + def _set_driver_dataset_context(ctx: DataContext): + DataContext._set_current(ctx) + + self.worker_group.execute( + _set_driver_dataset_context, + DataContext.get_current(), + ) + + share_cuda_visible_devices_enabled = bool( + env_integer( + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + self._backend.share_cuda_visible_devices, + ) + ) + + if ( + self._resources_per_worker.get("GPU", 0) > 0 + and share_cuda_visible_devices_enabled + ): + self._share_cuda_visible_devices() + for resource_config in self._resource_configs: + if self._is_share_resources_enabled( + resource_config.resource_name, + resource_config.resource_enable_sharing_env_var, + ): + self._share_resource_ids( + resource_config.resource_name, + resource_config.share_resource_ids_env_var, + ) + self._backend.on_start(self.worker_group, self._backend_config) + except RayActorError as exc: + logger.exception(str(exc)) + logger.warning( + "Failure occurred during startup. Restarting all workers and " + "attempting to startup again." + ) + self._increment_failures() + self._restart() + + if self.state_tracking_enabled: + from ray.train._internal.state import TrainRunStateManager + from ray.train._internal.state.state_actor import get_state_actor + + self.state_manager = TrainRunStateManager(state_actor=get_state_actor()) + + def _create_placement_group(self): + """Creates a placement group if it does not exist. + + If a placement group is already detected (Tune) this will be a no-op. + + By default the placement group will be created with PACK strategy. + This is optimized for colocating GPUs on a minimal number of nodes. + This behavior can be overridden to use the SPREAD strategy by defining + ``TRAIN_ENABLE_WORKER_SPREAD_ENV`` + + If a placement group is created it will be stored as + self._placement_group. + """ + current_placement_group = get_current_placement_group() + worker = ray._private.worker.global_worker + should_capture_child_tasks_in_placement_group = ( + worker.should_capture_child_tasks_in_placement_group + ) + should_create_placement_group = ( + current_placement_group is None + or not should_capture_child_tasks_in_placement_group + ) + + if should_create_placement_group: + bundles = [ + self._resources_per_worker.copy() for _ in range(self._num_workers) + ] + + use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0)) + strategy = "SPREAD" if use_spread else "PACK" + + placement_group = ray.util.placement_group(bundles, strategy=strategy) + logger.debug("Waiting for placement group to start.") + timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100) + ready, _ = ray.wait([placement_group.ready()], timeout=timeout) + if ready: + logger.debug("Placement group has started.") + else: + raise TimeoutError( + "Placement group creation timed out. Make sure your " + "cluster either has enough resources or use an " + "autoscaling cluster. If you are running on a cluster, " + "make sure you specify an address in `ray.init()`, for example, " + '`ray.init("auto")`. You can also increase the timeout by setting ' + "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. " + "Current resources available: {}, resources requested by the " + "placement group: {}".format( + ray.available_resources(), placement_group.bundle_specs + ) + ) + self._placement_group = placement_group + + def _share_cuda_visible_devices(self): + """Sets CUDA_VISIBLE_DEVICES on all workers. 
+ + For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs + visible to all workers on that worker's node. + + This allows GPU workers on the same node to communicate with one + another. + + Example: + + Setup: + - Node1: + - Worker1: {0, 1} + - Worker2: {2, 3} + - Node2: + - Worker3: {0, 1} + + CUDA_VISIBLE_DEVICES: + - Worker1: "0,1,2,3" + - Worker2: "0,1,2,3" + - Worker3: "0,1" + + """ + self._share_resource_ids( + ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR + ) + + def _share_resource_ids(self, resource: str, env_var: str): + """Sets the given env_var on all workers. + + For each worker, the cores/devices are visible to all the + workers on that worker's node. This allows workers on the + same node to communicate with one another. + + Example: + + Setup: + - Node1: + - Worker1: {0, 1} + - Worker2: {2, 3} + - Node2: + - Worker3: {0, 1} + + NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...: + - Worker1: "0,1,2,3" + - Worker2: "0,1,2,3" + - Worker3: "0,1" + + Args: + resource: The name of the resource/accelerator. + env_var: The name of the environment variable to set. + """ + node_ids_and_resource_ids = [ + ( + w.metadata.node_id, + w.metadata.resource_ids[resource], + ) + for w in self.worker_group.workers + ] + node_id_to_worker_id = defaultdict(set) + node_id_to_resource_ids = defaultdict(set) + + for worker_id, (node_id, resource_ids) in enumerate(node_ids_and_resource_ids): + node_id_to_worker_id[node_id].add(worker_id) + node_id_to_resource_ids[node_id].update(resource_ids) + + futures = [] + for node_id, resource_ids in node_id_to_resource_ids.items(): + resource_ids = sorted(resource_ids) + all_resource_ids = ",".join(resource_ids) + + def set_resource_ids(): + os.environ[env_var] = all_resource_ids + + for worker_id in node_id_to_worker_id[node_id]: + futures.append( + self.worker_group.execute_single_async(worker_id, set_resource_ids) + ) + ray.get(futures) + + def _is_share_resources_enabled(self, resource_name: str, enable_sharing_env: str): + """Whether to share resource IDs on all workers + based on enable_sharing_env. + + This will return True if the resource is requested and greater than 0. + Also, the user can disable sharing by setting `enable_sharing_env` to "0". + + Args: + resource_name: The name of the resource/accelerator. + enable_sharing_env: The name of the environment variable + to check. + """ + has_resource_requested = self._resources_per_worker.get(resource_name, 0) > 0 + return has_resource_requested and ray_constants.env_bool( + enable_sharing_env, True + ) + + def _create_rank_world_size_mappings(self) -> List[Dict]: + """Create rank and world size mappings for workers. + There are three maps returned: + - local_rank_map, which maps from worker world_rank to local_rank. + - local_world_size_map, which maps from world_rank to local_world_size. + - node_rank_map, which maps from world rank to node rank. + + Example: + Worker 0: node 0 + Worker 1: node 0 + Worker 2: node 1 + Worker 3: node 0 + Worker 4: node 1 + + Workers 0, 1, 3 are on node 0. + Workers 2, 4 are on node 1.
+ + Expected local_rank_map: + { + 0 -> 0, + 1 -> 1, + 2 -> 0, + 3 -> 2, + 4 -> 1 + } + + Expected local_world_size_map: + { + 0 -> 3, + 1 -> 3, + 2 -> 2, + 3 -> 3, + 4 -> 2 + } + + Expected node_rank_map: + { + 0 -> 0, + 1 -> 0, + 2 -> 1, + 3 -> 0, + 4 -> 1 + } + + """ + local_rank_map = {} # map from world rank to local rank + local_world_size_map = {} # map from world rank to local world size + node_rank_map = {} # map from world rank to node rank + node_ids = {} # map from node id to node index + node_cnt = 0 # count the number of nodes + + node_id_dict = defaultdict( + int + ) # map from node id to the number of workers on it. + for world_rank in range(len(self.worker_group)): + worker = self.worker_group.workers[world_rank] + node_id = worker.metadata.node_id + local_rank_map[world_rank] = node_id_dict[node_id] + node_id_dict[node_id] += 1 + + if node_id not in node_ids: + node_ids[node_id] = node_cnt + node_cnt += 1 + node_rank_map[world_rank] = node_ids[node_id] + + for world_rank in range(len(self.worker_group)): + worker = self.worker_group.workers[world_rank] + node_id = worker.metadata.node_id + local_world_size_map[world_rank] = node_id_dict[node_id] + + workers_info = "\n".join( + [ + f"- (node_id={w.metadata.node_id}, ip={w.metadata.node_ip}, " + f"pid={w.metadata.pid}) world_rank={i}, " + f"local_rank={local_rank_map[i]}, node_rank={node_rank_map[i]}" + for i, w in enumerate(self.worker_group.workers) + ] + ) + logger.info(f"Started distributed worker processes: \n{workers_info}") + + return local_rank_map, local_world_size_map, node_rank_map + + def start_training( + self, + train_func: Callable[[], T], + datasets: Dict[str, Dataset], + metadata: Dict[str, Any], + data_config: DataConfig, + storage: StorageContext, + checkpoint: Optional[Checkpoint] = None, + ) -> None: + """Executes a training function on all workers in a separate thread. + + ``finish_training`` should be called after this. + + Args: + train_func: The training function to run on each worker. + datasets: The base datasets. + data_config: The config object for creating dataset shards for workers. + checkpoint: The checkpoint data that + should be loaded onto each worker and accessed by the + training function via ``session.get_checkpoint()``. If this + is ``None`` then no checkpoint will be loaded. + """ + use_detailed_autofilled_metrics = env_integer( + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0 + ) + + # First initialize the session. + def initialize_session( + train_func, + world_rank, + local_rank, + node_rank, + local_world_size, + world_size, + trial_info, + checkpoint, + dataset_shard, + metadata, + storage, + ): + try: + init_session( + training_func=train_func, + world_rank=world_rank, + local_rank=local_rank, + node_rank=node_rank, + local_world_size=local_world_size, + world_size=world_size, + trial_info=trial_info, + dataset_shard=dataset_shard, + metadata=metadata, + checkpoint=checkpoint, + detailed_autofilled_metrics=use_detailed_autofilled_metrics, + storage=storage, + ) + except ValueError: + raise TrainBackendError( + "Attempting to start training but a " + "previous training run is still ongoing. " + "You must call `finish_training` before " + "calling `start_training` again." 
+ ) + + if self.dataset_shards is None: + actors = [worker.actor for worker in self.worker_group.workers] + node_ids = [worker.metadata.node_id for worker in self.worker_group.workers] + self.dataset_shards = data_config.configure( + datasets, + world_size=len(self.worker_group), + worker_handles=actors, + worker_node_ids=node_ids, + ) + + ( + local_rank_map, + local_world_size_map, + node_rank_map, + ) = self._create_rank_world_size_mappings() + + futures = [] + for index in range(len(self.worker_group)): + futures.append( + self.worker_group.execute_single_async( + index, + initialize_session, + world_rank=index, + local_rank=local_rank_map[index], + node_rank=node_rank_map[index], + local_world_size=local_world_size_map[index], + world_size=len(self.worker_group), + trial_info=self._trial_info, + train_func=train_func, + dataset_shard=self.dataset_shards[index], + metadata=metadata, + checkpoint=checkpoint, + storage=storage, + ) + ) + + self._backend.on_training_start(self.worker_group, self._backend_config) + + self.get_with_failure_handling(futures) + + # Register Train Run before training starts + if self.state_tracking_enabled: + from ray.train._internal.state.schema import RunStatusEnum + + core_context = ray.runtime_context.get_runtime_context() + + self.state_manager.register_train_run( + run_id=self._trial_info.run_id, + run_name=self._trial_info.experiment_name, + job_id=core_context.get_job_id(), + controller_actor_id=core_context.get_actor_id(), + datasets=datasets, + worker_group=self.worker_group, + start_time_ms=self._start_time_ms, + run_status=RunStatusEnum.RUNNING, + ) + + # Run the training function asynchronously in its own thread. + def train_async(): + session = get_session() + session.start() + + self.worker_group.execute_async(train_async) + + def get_next_results(self) -> Optional[List[_TrainingResult]]: + """Fetches the next ``_TrainingResult`` from each worker. + + Each ``_TrainingResult`` is expected to correspond to the same step from + each worker (e.g. the same call to ``train.report()``). + + Returns: + A list of ``_TrainingResult``s or ``None`` if there are no more results + since the training function has exited on all workers. + """ + + def get_next(): + session = _get_session("get_next_results") + try: + result = session.get_next() + except RuntimeError: + # Training thread has not been started yet. + raise TrainBackendError( + "`get_next_results` has been called " + "before `start_training`. Please call " + "`start_training` before " + "`get_next_results`." + ) + + return result + + # Get next result from each worker. + futures = self.worker_group.execute_async(get_next) + results = self.get_with_failure_handling(futures) + + # Check if any worker returned None. + if any(r is None for r in results): + # Either all workers have results or none of them do. + if not all(r is None for r in results): + raise RuntimeError( + "Some workers returned results while " + "others didn't. Make sure that " + "`session.report()` are called the " + "same number of times on all workers." + ) + else: + # Return None if all results are None. + return None + + return results + + def pause_reporting(self): + """Disable workers from enqueuing results from ``session.report()``. + + Note: Already reported results may still be enqueued at this point, + and should be handled appropriately. 
+ """ + + def pause_session_reporting(): + session = _get_session("pause_reporting") + return session.pause_reporting() + + futures = self.worker_group.execute_async(pause_session_reporting) + self.get_with_failure_handling(futures) + + def finish_training(self): + """Finish training and return final results. Propagate any exceptions. + + Blocks until training is finished on all workers. + + Assumes `start_training` has already been called. + + Returns: + A list of return values from calling ``train_func`` on each worker. + Each item corresponds to the return value from a single worker. + """ + + def end_training(): + session = _get_session("finish_training") + try: + # session.finish raises any Exceptions from training. + output = session.finish() + finally: + # Shutdown session even if session.finish() raises an + # Exception. + shutdown_session() + + return output + + futures = self.worker_group.execute_async(end_training) + results = self.get_with_failure_handling(futures) + return results + + def report_final_run_status( + self, + errored: bool = False, + failed_rank: Optional[int] = None, + stack_trace: Optional[str] = None, + ): + """Report the final train run status, error, and end time to TrainStateActor.""" + if self.state_tracking_enabled: + from ray.train._internal.state.schema import ( + MAX_ERROR_STACK_TRACE_LENGTH, + RunStatusEnum, + ) + + if errored: + run_status = RunStatusEnum.ERRORED + status_detail = "" + if failed_rank is not None: + status_detail += f"Rank {failed_rank} worker raised an error. \n" + if stack_trace is not None: + # Keep only the last part of the stack trace if it's too long. + status_detail += stack_trace[-MAX_ERROR_STACK_TRACE_LENGTH:] + else: + run_status = RunStatusEnum.FINISHED + status_detail = "" + + self.state_manager.end_train_run( + run_id=self._trial_info.run_id, + run_status=run_status, + status_detail=status_detail, + end_time_ms=int(time.time() * 1000), + ) + + def get_with_failure_handling(self, remote_values): + """Gets the remote values while handling for worker failures. + + This method should be called instead of ``ray.get()`` directly in + order to handle worker failures. + + If a worker failure is identified, backend specific failure handling + is executed and a ``TrainingWorkerError`` is raised. + + Args: + remote_values: List of object refs representing functions + that may fail in the middle of execution. For example, running + a Train training loop in multiple parallel actor calls. + Returns: + The resolved objects represented by the passed in ObjectRefs. + """ + success, exception = check_for_failure(remote_values) + if success: + return ray.get(remote_values) + else: + self._last_failure = exception + self._increment_failures() + logger.warning( + "Failure identified during training. Restarting all workers and " + "continuing training from latest checkpoint." + ) + self._restart() + raise TrainingWorkerError + + def shutdown(self, graceful_termination: bool = True): + """Shuts down the workers in the worker group. + + Args: + graceful_termination: If set to True, attempt to clean up the backend + before terminating the Ray actors. + + """ + if graceful_termination: + try: + self._backend.on_shutdown(self.worker_group, self._backend_config) + except RayActorError: + logger.warning( + "Graceful shutdown of backend failed. This is " + "expected if one of the workers has crashed." 
) + + if graceful_termination: + self.worker_group.shutdown() + else: + self.worker_group.shutdown(patience_s=0) + self.worker_group = InactiveWorkerGroup() + + if self._placement_group: + remove_placement_group(self._placement_group) + self._placement_group = None + + self.dataset_shards = None + + def is_started(self): + return not isinstance(self.worker_group, InactiveWorkerGroup) + + def _restart(self): + self.worker_group.shutdown() + if self._initialization_hook is not None: + initialization_hook = self._initialization_hook + else: + initialization_hook = None + if self._placement_group: + remove_placement_group(self._placement_group) + self._placement_group = None + self.start(initialization_hook=initialization_hook) + + def _increment_failures(self): + self._num_failures += 1 + if self._num_failures >= self._max_failures: + failure = self._last_failure + self._last_failure = None + if self._max_failures > 0: + exc = RuntimeError( + "Training has failed after " f"{self._num_failures} " "attempts." + ) + raise exc.with_traceback(None) from failure + else: + raise failure + + def get_worker_group(self): + return self.worker_group + + def _get_num_failures(self): + return self._num_failures + + +class InactiveWorkerGroupError(Exception): + """Raised when underlying worker group is inactive.""" + + +class InactiveWorkerGroup: + # TODO: fix inheritance. Perhaps create WorkerGroupInterface. + + # Need to define getstate and setstate so that getattr does not screw up + # pickling. See https://stackoverflow.com/a/50888571/11249691 + def __getstate__(self): + return vars(self) + + def __setstate__(self, state): + vars(self).update(state) + + def __getattr__(self, name): + raise InactiveWorkerGroupError() + + def __len__(self): + raise InactiveWorkerGroupError() + + +def _get_session(method_name: str): + # Get the session for this worker. + session = get_session() + if not session: + # Session is not initialized yet. + raise TrainBackendError( + f"`{method_name}` has been called " + "before `start_training`. Please call " + "`start_training` before " + f"`{method_name}`." + ) + return session diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0eb55275246b5dcc7cff0c6fda58f2318b5076 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/checkpoint_manager.py @@ -0,0 +1,185 @@ +import logging +import numbers +from typing import Any, Callable, List, Optional, Tuple + +from ray._private.dict import flatten_dict +from ray.air._internal.util import is_nan +from ray.air.config import MAX +from ray.train import CheckpointConfig +from ray.train._internal.session import _TrainingResult +from ray.train._internal.storage import _delete_fs_path + +logger = logging.getLogger(__name__) + + +def _insert_into_sorted_list(list: List[Any], item: Any, key: Callable[[Any], Any]): + """Insert an item into a sorted list with a custom key function.
+ + Examples: + + >>> list = [] + >>> _insert_into_sorted_list(list, {"a": 1, "b": 0}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}] + >>> _insert_into_sorted_list(list, {"a": 3, "b": 1}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}] + >>> _insert_into_sorted_list(list, {"a": 4, "b": 2}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}] + >>> _insert_into_sorted_list(list, {"a": 1, "b": 3}, lambda x: x["a"]) + >>> list + [{'a': 1, 'b': 0}, {'a': 1, 'b': 3}, {'a': 3, 'b': 1}, {'a': 4, 'b': 2}] + """ + i = 0 + while i < len(list): + # Insert to the right of all duplicates. + if key(list[i]) > key(item): + break + i += 1 + list.insert(i, item) + + +class _CheckpointManager: + """Checkpoint manager that handles checkpoint book-keeping for a trial. + + The main purpose of this abstraction is to keep the top K checkpoints based on + recency/a user-provided metric. + + NOTE: This class interacts with `_TrainingResult` objects, which are + (checkpoint, metrics) pairs. This is to order checkpoints by metrics. + + Args: + checkpoint_config: Defines how many and which checkpoints to keep. + """ + + def __init__(self, checkpoint_config: Optional[CheckpointConfig]): + self._checkpoint_config = checkpoint_config or CheckpointConfig() + + # List of checkpoints ordered by ascending score. + self._checkpoint_results: List[_TrainingResult] = [] + + # The latest registered checkpoint. + # This should never be immediately deleted upon registration, + # even if it's not in the top K checkpoints, based on score. + self._latest_checkpoint_result: Optional[_TrainingResult] = None + + if ( + self._checkpoint_config.num_to_keep is not None + and self._checkpoint_config.num_to_keep <= 0 + ): + raise ValueError( + f"`num_to_keep` must >= 1, got: " + f"{self._checkpoint_config.num_to_keep}" + ) + + @property + def checkpoint_config(self): + return self._checkpoint_config + + def register_checkpoint(self, checkpoint_result: _TrainingResult): + """Register new checkpoint and add to bookkeeping. + + This method will register a new checkpoint and add it to the internal + bookkeeping logic. This means the checkpoint manager will decide if + this checkpoint should be kept, and if older or worse performing + checkpoints should be deleted. + + Args: + checkpoint: Tracked checkpoint object to add to bookkeeping. + """ + self._latest_checkpoint_result = checkpoint_result + + if self._checkpoint_config.checkpoint_score_attribute is not None: + # If we're ordering by a score, insert the checkpoint + # so that the list remains sorted. + _insert_into_sorted_list( + self._checkpoint_results, + checkpoint_result, + key=self._get_checkpoint_score, + ) + else: + # If no metric is provided, just append (ordering by time of registration). + self._checkpoint_results.append(checkpoint_result) + + if self._checkpoint_config.num_to_keep is not None: + # Delete the bottom (N - K) checkpoints + worst_results = set( + self._checkpoint_results[: -self._checkpoint_config.num_to_keep] + ) + # Except for the latest checkpoint. + results_to_delete = worst_results - {self._latest_checkpoint_result} + + # Update internal state before actually deleting them. 
+ self._checkpoint_results = [ + checkpoint_result + for checkpoint_result in self._checkpoint_results + if checkpoint_result not in results_to_delete + ] + + for checkpoint_result in results_to_delete: + checkpoint = checkpoint_result.checkpoint + logger.debug("Deleting checkpoint: ", checkpoint) + _delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path) + + def _get_checkpoint_score( + self, checkpoint: _TrainingResult + ) -> Tuple[bool, numbers.Number]: + """Get the score for a checkpoint, according to checkpoint config. + + If `mode="min"`, the metric is negated so that the lowest score is + treated as the best. + + Returns: + Tuple: A tuple of (not_is_nan: bool, score: numbers.Number). + This score orders: nan values < float("-inf") < valid numeric metrics + """ + checkpoint_score_attribute = self._checkpoint_config.checkpoint_score_attribute + if checkpoint_score_attribute: + flat_metrics = flatten_dict(checkpoint.metrics) + try: + checkpoint_result = flat_metrics[checkpoint_score_attribute] + except KeyError: + valid_keys = list(flat_metrics.keys()) + logger.error( + f"Result dict has no key: {checkpoint_score_attribute}. " + f"checkpoint_score_attr must be set to a key in the " + f"result dict. Valid keys are: {valid_keys}" + ) + checkpoint_result = float("-inf") + else: + checkpoint_result = float("-inf") + + checkpoint_score_order = self._checkpoint_config.checkpoint_score_order + order_factor = 1.0 if checkpoint_score_order == MAX else -1.0 + + checkpoint_score = order_factor * checkpoint_result + + if not isinstance(checkpoint_score, numbers.Number): + raise ValueError( + f"Unable to persist checkpoint for " + f"checkpoint_score_attribute: " + f"{checkpoint_score_attribute} with value " + f"{checkpoint_score}. " + f"This attribute must be numerical." + ) + + return ( + (not is_nan(checkpoint_score), checkpoint_score) + if not is_nan(checkpoint_score) + else (False, float("-inf")) + ) + + @property + def best_checkpoint_result(self) -> Optional[_TrainingResult]: + return self._checkpoint_results[-1] if self._checkpoint_results else None + + @property + def latest_checkpoint_result(self) -> Optional[_TrainingResult]: + return self._latest_checkpoint_result + + @property + def best_checkpoint_results(self) -> List[_TrainingResult]: + if self._checkpoint_config.num_to_keep is None: + return self._checkpoint_results + return self._checkpoint_results[-self._checkpoint_config.num_to_keep :] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py new file mode 100644 index 0000000000000000000000000000000000000000..13ec9addb3bf2027e49b8cf08b49e4a7b40d5451 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/data_config.py @@ -0,0 +1,139 @@ +import copy +from typing import Dict, List, Literal, Optional, Union + +import ray +from ray.actor import ActorHandle +from ray.data import DataIterator, Dataset, ExecutionOptions, NodeIdStr +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI, PublicAPI + + +@PublicAPI(stability="stable") +class DataConfig: + """Class responsible for configuring Train dataset preprocessing. + + For advanced use cases, this class can be subclassed and the `configure()` method + overriden for custom data preprocessing. 
+ """ + + def __init__( + self, + datasets_to_split: Union[Literal["all"], List[str]] = "all", + execution_options: Optional[ExecutionOptions] = None, + ): + """Construct a DataConfig. + + Args: + datasets_to_split: Specifies which datasets should be split among workers. + Can be set to "all" or a list of dataset names. Defaults to "all", + i.e. split all datasets. + execution_options: The execution options to pass to Ray Data. By default, + the options will be optimized for data ingest. When overriding this, + base your options off of `DataConfig.default_ingest_options()`. + """ + if isinstance(datasets_to_split, list) or datasets_to_split == "all": + self._datasets_to_split = datasets_to_split + else: + raise TypeError( + "`datasets_to_split` should be a 'all' or a list of strings of " + "dataset names. Received " + f"{type(datasets_to_split).__name__} with value {datasets_to_split}." + ) + + self._execution_options: ExecutionOptions = ( + execution_options or DataConfig.default_ingest_options() + ) + + self._num_train_cpus = 0.0 + self._num_train_gpus = 0.0 + + def set_train_total_resources(self, num_train_cpus: float, num_train_gpus: float): + """Set the total number of CPUs and GPUs used by training. + + If CPU or GPU resource limits are not set, they will be set to the + total cluster resources minus the resources used by training. + """ + # TODO: We may also include other resources besides CPU and GPU. + self._num_train_cpus = num_train_cpus + self._num_train_gpus = num_train_gpus + + @DeveloperAPI + def configure( + self, + datasets: Dict[str, Dataset], + world_size: int, + worker_handles: Optional[List[ActorHandle]], + worker_node_ids: Optional[List[NodeIdStr]], + **kwargs, + ) -> List[Dict[str, DataIterator]]: + """Configure how Train datasets should be assigned to workers. + + Args: + datasets: The datasets dict passed to Train by the user. + world_size: The number of Train workers in total. + worker_handles: The actor handles of the Train workers. + worker_node_ids: The node ids of the Train workers. + kwargs: Forwards compatibility placeholder. + + Returns: + A list of dataset splits for each worker. The size of the list must be + equal to `world_size`. Each element of the list contains the assigned + `DataIterator` instances by name for the worker. + """ + output = [{} for _ in range(world_size)] + + if self._datasets_to_split == "all": + datasets_to_split = set(datasets.keys()) + else: + datasets_to_split = set(self._datasets_to_split) + + locality_hints = ( + worker_node_ids if self._execution_options.locality_with_output else None + ) + for name, ds in datasets.items(): + execution_options = copy.deepcopy(self._execution_options) + + if execution_options.is_resource_limits_default(): + # If "resource_limits" is not overriden by the user, + # add training-reserved resources to Data's exclude_resources. + execution_options.exclude_resources = ( + execution_options.exclude_resources.add( + ExecutionResources( + cpu=self._num_train_cpus, gpu=self._num_train_gpus + ) + ) + ) + + ds = ds.copy(ds) + ds.context.execution_options = execution_options + + if name in datasets_to_split: + for i, split in enumerate( + ds.streaming_split( + world_size, equal=True, locality_hints=locality_hints + ) + ): + output[i][name] = split + else: + for i in range(world_size): + output[i][name] = ds.iterator() + + return output + + @staticmethod + def default_ingest_options() -> ExecutionOptions: + """The default Ray Data options used for data ingest. 
+ + By default, configurations are carried over from what is already set + in DataContext. + """ + ctx = ray.data.DataContext.get_current() + return ExecutionOptions( + # TODO(hchen): Re-enable `locality_with_output` by default after fixing + # https://github.com/ray-project/ray/issues/40607 + locality_with_output=ctx.execution_options.locality_with_output, + resource_limits=ctx.execution_options.resource_limits, + exclude_resources=ctx.execution_options.exclude_resources, + preserve_order=ctx.execution_options.preserve_order, + verbose_progress=ctx.execution_options.verbose_progress, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..a159cfe7b9e141f1d25cfdf160285505efeb80b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/dl_predictor.py @@ -0,0 +1,103 @@ +import abc +from typing import Dict, Optional, TypeVar, Union + +import numpy as np +import pandas as pd + +from ray.air.util.data_batch_conversion import ( + BatchFormat, + _convert_batch_type_to_pandas, + _convert_pandas_to_batch_type, +) +from ray.train.predictor import Predictor +from ray.util.annotations import DeveloperAPI + +TensorType = TypeVar("TensorType") +TensorDtype = TypeVar("TensorDtype") + + +class DLPredictor(Predictor): + @abc.abstractmethod + def _arrays_to_tensors( + self, + numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Union[TensorType, Dict[str, TensorType]]: + """Converts a NumPy ndarray batch to the tensor type for the DL framework. + + Args: + numpy_arrays: A (dict of) NumPy ndarray(s) to convert to a (dict of) + tensor(s). + dtype: A (dict of) tensor dtype(s) to use when creating the DL tensor; if + None, the dtype will be inferred from the NumPy ndarray data. + + Returns: + A deep learning framework specific tensor. + """ + raise NotImplementedError + + @abc.abstractmethod + def _tensor_to_array(self, tensor: TensorType) -> np.ndarray: + """Converts a framework-specific tensor to a numpy array. + + Args: + tensor: A framework specific tensor. + + Returns: + A numpy array representing the input tensor. + """ + + raise NotImplementedError + + @abc.abstractmethod + @DeveloperAPI + def call_model( + self, inputs: Union[TensorType, Dict[str, TensorType]] + ) -> Union[TensorType, Dict[str, TensorType]]: + """Inputs the tensor to the model for this Predictor and returns the result. + + Args: + inputs: The tensor to input to the model. + + Returns: + A tensor or dictionary of tensors containing the model output.
+ """ + raise NotImplementedError + + @classmethod + @DeveloperAPI + def preferred_batch_format(cls) -> BatchFormat: + return BatchFormat.NUMPY + + def _predict_pandas( + self, + data: pd.DataFrame, + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> pd.DataFrame: + numpy_input = _convert_pandas_to_batch_type( + data, + BatchFormat.NUMPY, + self._cast_tensor_columns, + ) + numpy_output = self._predict_numpy(numpy_input, dtype) + return _convert_batch_type_to_pandas(numpy_output) + + def _predict_numpy( + self, + data: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: + # Single column selection return numpy array so preprocessors can be + # reused in both training and prediction + if isinstance(data, dict) and len(data) == 1: + data = next(iter(data.values())) + model_input = self._arrays_to_tensors(data, dtype) + model_output = self.call_model(model_input) + # TODO (jiaodong): Investigate perf implication of this. + # Move DL Tensor to CPU and convert to numpy. + if isinstance(model_output, dict): + return {k: self._tensor_to_array(v) for k, v in model_output.items()} + else: + return {"predictions": self._tensor_to_array(model_output)} diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..26259214d84cd1643ea40751cbc5b7ebdbf3d995 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/framework_checkpoint.py @@ -0,0 +1,45 @@ +from typing import Optional + +import ray.cloudpickle as ray_pickle +from ray._private.utils import binary_to_hex, hex_to_binary +from ray.data.preprocessor import Preprocessor +from ray.train._checkpoint import Checkpoint + +PREPROCESSOR_KEY = "preprocessor_pkl" + + +class FrameworkCheckpoint(Checkpoint): + """A checkpoint to preserve the functionality of legacy + framework-specific checkpoints. + + Example: + + >>> import tempfile + >>> checkpoint = FrameworkCheckpoint(tempfile.mkdtemp()) + >>> checkpoint.get_preprocessor() is None + True + >>> preprocessor = Preprocessor() + >>> preprocessor._attr = 1234 + >>> checkpoint.set_preprocessor(preprocessor) + >>> checkpoint.get_preprocessor()._attr + 1234 + """ + + def get_preprocessor(self) -> Optional[Preprocessor]: + """Return the preprocessor stored in the checkpoint. + + Returns: + The preprocessor stored in the checkpoint, or ``None`` if no + preprocessor was stored. 
+ """ + metadata = self.get_metadata() + preprocessor_bytes = metadata.get(PREPROCESSOR_KEY) + if preprocessor_bytes is None: + return None + return ray_pickle.loads(hex_to_binary(preprocessor_bytes)) + + def set_preprocessor(self, preprocessor: Preprocessor): + """Store a preprocessor with the checkpoint.""" + self.update_metadata( + {PREPROCESSOR_KEY: binary_to_hex(ray_pickle.dumps(preprocessor))} + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py new file mode 100644 index 0000000000000000000000000000000000000000..f142685caaab7f24f2fdb51938efa06eb9d6c4b1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/session.py @@ -0,0 +1,1163 @@ +import functools +import logging +import os +import platform +import queue +import sys +import threading +import time +import warnings +from dataclasses import dataclass +from datetime import datetime +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Set, Type + +import ray +from ray.air._internal.util import RunnerThread, StartTraceback +from ray.air.constants import ( + _ERROR_FETCH_TIMEOUT, + _RESULT_FETCH_TIMEOUT, + SESSION_MISUSE_LOG_ONCE_KEY, + TIME_THIS_ITER_S, + TIMESTAMP, +) +from ray.data import Dataset +from ray.train import Checkpoint +from ray.train._internal.accelerator import Accelerator +from ray.train._internal.storage import StorageContext +from ray.train.constants import ( + CHECKPOINT_DIR_NAME, + DETAILED_AUTOFILLED_KEYS, + RAY_CHDIR_TO_TRIAL_DIR, + TIME_TOTAL_S, + WORKER_HOSTNAME, + WORKER_NODE_IP, + WORKER_PID, + _v2_migration_warnings_enabled, +) +from ray.train.error import SessionMisuseError +from ray.train.utils import _log_deprecation_warning +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.util.debug import log_once +from ray.util.placement_group import _valid_resource_shape +from ray.util.scheduling_strategies import ( + PlacementGroupSchedulingStrategy, + SchedulingStrategyT, +) + +if TYPE_CHECKING: + from ray.data import DataIterator + from ray.tune.execution.placement_groups import PlacementGroupFactory + + +logger = logging.getLogger(__name__) + + +@dataclass +class TrialInfo: + """The trial information to propagate to TrainSession.""" + + name: str + id: str + resources: Dict[str, float] + logdir: str + driver_ip: str + driver_node_id: str + experiment_name: Optional[str] = None + run_id: Optional[str] = None + + +class _FutureTrainingResult: + """A future that will be resolved to a `_TrainingResult`. + + This is needed for specific schedulers such as PBT that schedule saves. + + This wrapper should be removed after refactoring PBT to not schedule saves anymore. + """ + + def __init__(self, future: ray.ObjectRef): + self.future = future + + def resolve(self, block: bool = True) -> Optional["_TrainingResult"]: + """Resolve into ``_TrainingResult``. + + This will return None for function trainables if no checkpoint has been + saved before. 
+ """ + if block: + timeout = None + else: + timeout = 1e-9 + try: + return ray.get(self.future, timeout=timeout) + except TimeoutError: + # Not ready, yet + pass + except Exception as exc: + logger.error(f"Error resolving result: {exc}") + + +class _TrainingResult: + """A (checkpoint, metrics) result reported by the user.""" + + def __init__(self, checkpoint: Optional[Checkpoint], metrics: Dict[str, Any]): + self.checkpoint = checkpoint + self.metrics = metrics + + def __repr__(self) -> str: + return f"TrainingResult(checkpoint={self.checkpoint}, metrics={self.metrics})" + + +# TODO(xwjiang): This needs a better name. +@DeveloperAPI +class _TrainSession: + """Holds information for training on each worker.""" + + def __init__( + self, + training_func: Callable, + world_rank: Optional[int], + local_rank: Optional[int], + node_rank: Optional[int], + local_world_size: Optional[int], + world_size: Optional[int], + trial_info: Optional[TrialInfo] = None, + dataset_shard: Optional[Dict[str, Dataset]] = None, + metadata: Dict[str, Any] = None, + checkpoint: Optional[Checkpoint] = None, + detailed_autofilled_metrics: bool = False, + storage: Optional[StorageContext] = None, + synchronous_result_reporting: bool = False, + ): + # `synchronous_result_reporting` refers to whether or not the + # training function is immediately unblocked to continue running + # after the main thread receives its result. + # Ex 1: For 2 Ray Train workers with synchronous_result_reporting=True, + # the worker that produces a result first will immediately will continue + # onto the next iteration. + # Ex 2: For a Tune function Trainable with `synchronous_result_reporting=False`, + # training will only continue with an explicit call to `session.get_next`. + # Synchronous reporting in example 2 is needed for Tune schedulers to + # be able to stop the execution of the training function at will, + # for advanced pausing schedulers (PBT, BOHB) and actor reuse. + self.synchronous_result_reporting = synchronous_result_reporting + + # Ray Train worker properties + # Note: These are set to None for Tune function Trainables. + self.dataset_shard = dataset_shard + self.metadata = metadata + + self.world_rank = world_rank + self.local_rank = local_rank + self.node_rank = node_rank + self.local_world_size = local_world_size + self.world_size = world_size + + assert storage + logger.debug(f"StorageContext on SESSION (rank={world_rank}):\n{storage}") + + # NOTE: `reset` will initialize many properties needed to start running the + # training_func as a thread. + self.reset( + training_func=training_func, + trial_info=trial_info, + storage=storage, + loaded_checkpoint=checkpoint, + ) + + # Autofilled metrics attributes. + self.detailed_autofilled_metrics = detailed_autofilled_metrics + self.last_report_time = time.time() + self.iteration = 0 + self.time_total = 0.0 + self.local_ip = self.get_current_ip() + + self.accelerator = None + self._state = {} + + def get_state(self, key: str) -> Any: + return self._state.get(key) + + def set_state(self, key: str, value: Any): + self._state[key] = value + + def get_current_ip(self): + self.local_ip = ray.util.get_node_ip_address() + return self.local_ip + + def start(self): + """Starts the training thread.""" + self.training_started = True + self.training_thread.start() + + def reset( + self, + training_func: Callable, + trial_info: TrialInfo, + storage: StorageContext, + loaded_checkpoint=None, + ): + # This lock is used to control the execution of the training thread. 
+ self.continue_lock = threading.Semaphore(0) + + # This event is used to signal the training thread to stop. + self.stop_event = threading.Event() + + # Queue for sending results across threads. + self.result_queue = queue.Queue(1) + + # Queue for raising exceptions from runner thread to main thread. + # The error queue has a max size of one to prevent stacking error and force + # error reporting to block until finished. + self.error_queue = queue.Queue(1) + + # The Thread object that is running the training function. + self.training_thread = RunnerThread( + target=training_func, daemon=True, error_queue=self.error_queue + ) + + # Possibly override with new state + self.trial_info = trial_info + self.storage = storage + self.loaded_checkpoint = loaded_checkpoint + + # Reset state + self._state = {} + self.ignore_report = False + self.training_started = False + self._first_report = True + + # Change the working directory to a special trial folder. + # This is to ensure that all Ray Train workers have a common working directory. + os.makedirs(storage.trial_working_directory, exist_ok=True) + if bool(int(os.environ.get(RAY_CHDIR_TO_TRIAL_DIR, "1"))): + logger.debug( + f"Changing the working directory to: {storage.trial_working_directory}" + ) + os.chdir(storage.trial_working_directory) + + def pause_reporting(self): + """Ignore all future ``session.report()`` calls.""" + self.ignore_report = True + + def finish(self, timeout: Optional[float] = None) -> Optional[Any]: + """Finishes the training thread. + + Raises any Exception from training. + """ + # Set the stop event for the training thread to gracefully exit. + self.stop_event.set() + + # Release the lock so that training thread can process this event. + self.continue_lock.release() + + # Force a final (blocking) sync of artifacts in the trial path to storage. + self.storage.persist_artifacts(force=True) + + # Wait for training to finish. + # This will raise any errors that occur during training, including SystemError + # This returns the result of the training function. + output = None + if self.training_started: + output = self.training_thread.join(timeout=timeout) + + return output + + def get_next(self) -> Optional[_TrainingResult]: + """Gets the next ``_TrainingResult`` from the result queue. + + If the result queue is empty, then this function returns ``None``. + """ + if not self.training_started: + raise RuntimeError("Please call start before calling get_next.") + + if self.synchronous_result_reporting: + # There's no need to release the lock on the first report + # since `start` already started the training thread. + if not self._first_report: + # Release the lock to trigger training to continue, + # until the next call to report. + self.continue_lock.release() + self._first_report = False + + result = None + # While training is still ongoing, attempt to get the result. + while result is None and self.training_thread.is_alive(): + try: + result = self.result_queue.get( + block=True, timeout=_RESULT_FETCH_TIMEOUT + ) + except queue.Empty: + pass + + # If no result was found, then the runner must no longer be alive. + if result is None: + # Try one last time to fetch results in case results were + # reported in between the time of the last check and the + # termination of the thread runner. + try: + result = self.result_queue.get( + block=False, timeout=_RESULT_FETCH_TIMEOUT + ) + except queue.Empty: + pass + + # check if error occurred inside the thread runner. 
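+            # If no result was fetched and the runner thread raised an
+            # exception, surface that error now; otherwise note the pending
+            # error and hand back the fetched result first.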
+ if result is None: + # only raise an error from the runner if all results are consumed + self._report_thread_runner_error(block=True) + else: + if not self.error_queue.empty(): + logger.debug( + ( + "Runner error waiting to be raised in main thread. " + "Logging all available results first." + ) + ) + + if not self.synchronous_result_reporting: + # At this point, the training thread has reached + # the `train.report` and is blocked there. + # If performing asynchronous result reporting, + # release the lock to allow each worker to keep training + # immediately after the coordinator fetches their result. + self.continue_lock.release() + + # Return None if there are no more results to fetch. + return result + + def _auto_fill_metrics(self, result: dict) -> dict: + """Add autofilled metrics and update attributes.""" + current_time = time.time() + current_datetime = datetime.now() + if TIME_THIS_ITER_S in result: + time_this_iter = result[TIME_THIS_ITER_S] + else: + time_this_iter = current_time - self.last_report_time + self.iteration += 1 + self.time_total += time_this_iter + self.last_report_time = current_time + + auto_filled_metrics = { + TIMESTAMP: int(time.mktime(current_datetime.timetuple())), + TIME_TOTAL_S: self.time_total, + WORKER_PID: os.getpid(), + WORKER_HOSTNAME: platform.node(), + WORKER_NODE_IP: self.local_ip, + } + + if not self.detailed_autofilled_metrics: + auto_filled_metrics = { + k: v + for k, v in auto_filled_metrics.items() + if k not in DETAILED_AUTOFILLED_KEYS + } + + result = result.copy() + result.update(auto_filled_metrics) + return result + + def _auto_fill_checkpoint_metrics(self, result: dict) -> dict: + """Add autofilled metrics and update attributes.""" + current_datetime = datetime.now() + + auto_filled_metrics = { + TIMESTAMP: int(time.mktime(current_datetime.timetuple())) + } + result = result.copy() + result.update(auto_filled_metrics) + return result + + def _report_thread_runner_error(self, block=False): + try: + e = self.error_queue.get(block=block, timeout=_ERROR_FETCH_TIMEOUT) + raise StartTraceback from e + except queue.Empty: + pass + + def _report_training_result(self, training_result: _TrainingResult) -> None: + """Place a training result on the result queue for the main thread to process, + then block until the main thread signals that training should continue. + + NOTE: This is used internally to report results from Train to Tune + without persisting checkpoints to storage 2 times. + `report` is the public API that directly persists to storage, which + should only be called by user code. + """ + if training_result.checkpoint: + # NOTE: This populates `train.get_checkpoint` + self.loaded_checkpoint = training_result.checkpoint + + # Add result to a thread-safe queue. + self.result_queue.put(training_result, block=True) + + # Acquire lock to stop the training thread until main thread + # triggers resume. + self.continue_lock.acquire() + + # If the trial should be terminated, exit gracefully. + # NOTE: This is only really useful if `synchronous_result_reporting=True`. + # Otherwise, the lock is immediately released on reporting, and this + # check is skipped before the main thread decides to set the stop event. 
+ if self.stop_event.is_set(): + self.stop_event.clear() + sys.exit(0) + + def report(self, metrics: Dict, checkpoint: Optional[Checkpoint] = None) -> None: + # Special case: early fail for Torch tensors + if "torch" in sys.modules: + from ray.air._internal.torch_utils import contains_tensor + + if contains_tensor(metrics): + raise ValueError( + "Passing objects containg Torch tensors as metrics " + "is not supported as it will throw an exception on " + "deserialization. You can either convert the tensors " + "to Python objects or report a `train.Checkpoint` " + "with `ray.train.report` to store your Torch objects." + ) + + if self.ignore_report: + return + + metrics = self._auto_fill_metrics(metrics) + + persisted_checkpoint = None + if checkpoint: + self.storage._update_checkpoint_index(metrics) + + # Persist the reported checkpoint files to storage. + persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint) + + metrics[CHECKPOINT_DIR_NAME] = self.storage.checkpoint_dir_name + else: + metrics[CHECKPOINT_DIR_NAME] = None + + # Persist trial artifacts to storage. + force_artifact_sync = ( + persisted_checkpoint + and self.storage.sync_config.sync_artifacts_on_checkpoint + ) + self.storage.persist_artifacts(force=force_artifact_sync) + + # Set additional user metadata from the Trainer. + if persisted_checkpoint and self.metadata: + user_metadata = persisted_checkpoint.get_metadata() + for k, v in self.metadata.items(): + # Update keys not already set by the user. This gives user-set keys + # precedence over keys set at the Trainer level. + if k not in user_metadata: + user_metadata[k] = v + persisted_checkpoint.set_metadata(user_metadata) + + result = _TrainingResult(checkpoint=persisted_checkpoint, metrics=metrics) + + self._report_training_result(result) + + @property + def experiment_name(self) -> str: + return self.trial_info.experiment_name + + @property + def trial_name(self) -> str: + return self.trial_info.name + + @property + def trial_id(self) -> str: + return self.trial_info.id + + @property + def run_id(self) -> str: + return self.trial_info.run_id + + @property + def trial_resources(self) -> "PlacementGroupFactory": + return self.trial_info.resources + + @property + def trial_dir(self) -> str: + return self.trial_info.logdir + + def get_dataset_shard( + self, + dataset_name: Optional[str] = None, + ) -> Optional["DataIterator"]: + shard = self.dataset_shard + if shard is None: + warnings.warn( + "No dataset passed in. Returning None. Make sure to " + "pass in a Dataset to Trainer.run to use this " + "function." + ) + elif isinstance(shard, dict): + if not dataset_name: + raise RuntimeError( + "Multiple datasets were passed into ``Trainer``, " + "but no ``dataset_name`` is passed into " + "``get_dataset_shard``. Please specify which " + "dataset shard to retrieve." + ) + return shard.get(dataset_name) + return shard + + +# Cache of resource dicts that have been checked by the launch hook already. +_checked_resources: Set[frozenset] = set() + +# Global _TrainSession object initialized by Ray Tune function trainables +# and Ray Train V1 workers. +_session: Optional[_TrainSession] = None + + +def _tune_task_and_actor_launch_hook( + fn, resources: Dict[str, float], strategy: Optional[SchedulingStrategyT] +): + """Launch hook to catch nested tasks that can't fit in the placement group. + + This gives users a nice warning in case they launch a nested task in a Tune trial + without reserving resources in the trial placement group to fit it. 
+ """ + + # Already checked, skip for performance reasons. + key = frozenset({(k, v) for k, v in resources.items() if v > 0}) + if not key or key in _checked_resources: + return + + # No need to check if placement group is None. + if ( + not isinstance(strategy, PlacementGroupSchedulingStrategy) + or strategy.placement_group is None + ): + return + + # Check if the resource request is targeting the current placement group. + cur_pg = ray.util.get_current_placement_group() + if not cur_pg or strategy.placement_group.id != cur_pg.id: + return + + _checked_resources.add(key) + + # Check if the request can be fulfilled by the current placement group. + pgf = get_trial_resources() + + if pgf.head_bundle_is_empty: + available_bundles = cur_pg.bundle_specs[0:] + else: + available_bundles = cur_pg.bundle_specs[1:] + + # Check if the request can be fulfilled by the current placement group. + if _valid_resource_shape(resources, available_bundles): + return + + if fn.class_name: + submitted = "actor" + name = fn.module_name + "." + fn.class_name + "." + fn.function_name + else: + submitted = "task" + name = fn.module_name + "." + fn.function_name + + # Normalize the resource spec so it looks the same as the placement group bundle. + main_resources = cur_pg.bundle_specs[0] + resources = {k: float(v) for k, v in resources.items() if v > 0} + + raise RuntimeError( + f"No trial resources are available for launching the {submitted} `{name}`. " + "To resolve this, specify the Tune option:\n\n" + "> resources_per_trial=tune.PlacementGroupFactory(\n" + f"> [{main_resources}] + [{resources}] * N\n" + "> )\n\n" + f"Where `N` is the number of slots to reserve for trial {submitted}s. " + "If you are using a Ray training library, there might be a utility function " + "to set this automatically for you. For more information, refer to " + "https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html" + ) + + +def init_session(*args, **kwargs) -> None: + global _session + if _session: + raise ValueError( + "A Train session is already in use. Do not call " + "`init_session()` manually." + ) + + # Setup hooks for generating placement group resource deadlock warnings. + from ray import actor, remote_function + + if "TUNE_DISABLE_RESOURCE_CHECKS" not in os.environ: + actor._actor_launch_hook = _tune_task_and_actor_launch_hook + remote_function._task_launch_hook = _tune_task_and_actor_launch_hook + + _session = _TrainSession(*args, **kwargs) + + +def get_session() -> Optional[_TrainSession]: + return _session + + +def shutdown_session(): + """Shuts down the initialized session.""" + global _session + _session = None + + +def _raise_accelerator_session_misuse(): + """Raises a SessionMisuseError because a utility function was used improperly.""" + raise SessionMisuseError( + "prepare/accelerate utility functions should be called inside a training " + "function executed by `Trainer.run`" + ) + + +def get_accelerator(default_accelerator_cls: Type[Accelerator]) -> Accelerator: + """The accelerator for this training session. + + If an accelerator has not been set, then this method will construct an + accelerator using the provided accelerator class. + + Raises: + SessionMisuseError: if the session is uninitialized. 
+ """ + session = get_session() + if session is None: + _raise_accelerator_session_misuse() + if session.accelerator is None: + session.accelerator = default_accelerator_cls() + return session.accelerator + + +def set_accelerator(accelerator: Accelerator) -> None: + """Sets the accelerator for this training session. + + Args: + accelerator: The accelerator to use for training. + + Raises: + SessionMisuseError: if the session is unitialized. + RuntimeError: if the accelerator has already been set. + """ + session = get_session() + if session is None: + _raise_accelerator_session_misuse() + if session.accelerator is not None: + raise RuntimeError("Cannot change accelerator once set.") + session.accelerator = accelerator + + +def _warn_session_misuse(default_value: Any = None): + """Warns if fn is being used outside of session and returns ``default_value``.""" + + def inner(fn: Callable): + fn_name = fn.__name__ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + session = get_session() + if not session: + if log_once(f"{SESSION_MISUSE_LOG_ONCE_KEY}-{fn_name}"): + warnings.warn( + f"`{fn_name}` is meant to only be " + "called inside a function that is executed by a Tuner" + f" or Trainer. Returning `{default_value}`." + ) + return default_value + return fn(*args, **kwargs) + + return wrapper + + return inner + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def report(metrics: Dict, *, checkpoint: Optional[Checkpoint] = None) -> None: + """Report metrics and optionally save a checkpoint. + + If a checkpoint is provided, it will be + :ref:`persisted to storage `. + + If this is called in multiple distributed training workers: + + - Only the metrics reported by the rank 0 worker will be tracked by Ray Train. + See :ref:`the metrics logging guide `. + - A checkpoint will be registered as long as one or more workers reports + checkpoint that is not None. + See the :ref:`checkpointing guide `. + - Checkpoints from multiple workers will be merged into one directory + in persistent storage. + See :ref:`the distributed checkpointing guide `. + + .. note:: + + Each invocation of this method will automatically increment the underlying + ``training_iteration`` number. The physical meaning of this "iteration" is + defined by user depending on how often they call ``report``. + It does not necessarily map to one epoch. + + .. warning:: + + All workers must call `ray.train.report` the same number of times + so that Ray Train can properly synchronize the training state across + workers. Otherwise, your training will hang. + + .. warning:: + + This method does NOT act as a barrier for distributed training workers. + Workers will upload their checkpoint, then continue training immediately. + If you need to synchronize workers, you can use a framework-native barrier + such as `torch.distributed.barrier()`. + + Example: + + .. testcode:: + + import tempfile + + from ray import train + from ray.train import Checkpoint + from ray.train.torch import TorchTrainer + + + def train_func(config): + start_epoch = 0 + checkpoint = train.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_dir: + # Load back training state + ... + + for epoch in range(start_epoch, config.get("num_epochs", 10)): + # Do training... + + metrics = {"loss": ...} + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + # Save the checkpoint... + # torch.save(...) + + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + + # Example: Only the rank 0 worker uploads the checkpoint. 
+ if ray.train.get_context().get_world_rank() == 0: + train.report(metrics, checkpoint=checkpoint) + else: + train.report(metrics, checkpoint=None) + + trainer = TorchTrainer( + train_func, scaling_config=train.ScalingConfig(num_workers=2) + ) + + Args: + metrics: The metrics you want to report. + checkpoint: The optional checkpoint you want to report. + """ + # If we are running in a Tune function, switch to `ray.tune.report`. + from ray.tune.trainable.trainable_fn_utils import _in_tune_session + + if _in_tune_session(): + import ray.tune + + if _v2_migration_warnings_enabled(): + _log_deprecation_warning( + "`ray.train.report` should be switched to " + "`ray.tune.report` when running in a function " + "passed to Ray Tune. This will be an error in the future." + ) + return ray.tune.report(metrics, checkpoint=checkpoint) + + get_session().report(metrics, checkpoint=checkpoint) + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def get_checkpoint() -> Optional[Checkpoint]: + """Access the latest reported checkpoint to resume from if one exists. + + Example: + + .. testcode:: + + import tempfile + + from ray import train + from ray.train import Checkpoint + from ray.train.torch import TorchTrainer + + + def train_func(config): + start_epoch = 0 + checkpoint = train.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_dir: + # Load back training state + ... + + for epoch in range(start_epoch, config.get("num_epochs", 10)): + # Do training... + + metrics = {"loss": ...} + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + # Save the checkpoint... + + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + train.report(metrics, checkpoint=checkpoint) + + trainer = TorchTrainer( + train_func, scaling_config=train.ScalingConfig(num_workers=2) + ) + + Returns: + Checkpoint object if the session is currently being resumed. + Otherwise, return None. + """ + # If we are running in a Tune function, switch to `ray.tune.get_checkpoint`. + from ray.tune.trainable.trainable_fn_utils import _in_tune_session + + if _in_tune_session(): + import ray.tune + + if _v2_migration_warnings_enabled(): + _log_deprecation_warning( + "`ray.train.get_checkpoint` should be switched to " + "`ray.tune.get_checkpoint` when running in a function " + "passed to Ray Tune. This will be an error in the future." 
+ ) + return ray.tune.get_checkpoint() + + return get_session().loaded_checkpoint + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_metadata() -> Dict[str, Any]: + """User metadata dict passed to the Trainer constructor.""" + return get_session().metadata + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_experiment_name() -> str: + """Experiment name for the corresponding trial.""" + return get_session().experiment_name + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_name() -> str: + """Trial name for the corresponding trial.""" + return get_session().trial_name + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_id() -> str: + """Trial id for the corresponding trial.""" + return get_session().trial_id + + +@PublicAPI(stability="alpha") +@_warn_session_misuse() +def get_run_id() -> str: + """Unique Train Run id for the corresponding trial.""" + return get_session().run_id + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_resources() -> "PlacementGroupFactory": + """Trial resources for the corresponding trial.""" + return get_session().trial_resources + + +@PublicAPI(stability="beta") +@_warn_session_misuse() +def get_trial_dir() -> str: + """Log directory corresponding to the trial directory for a Tune session. + If calling from a Train session, this will give the trial directory of its parent + Tune session. + + .. testcode:: + + from ray import train, tune + + def train_func(config): + print(train.get_context().get_trial_dir()) + + tuner = tune.Tuner(train_func) + tuner.fit() + + .. testoutput:: + :options: +MOCK + + /Users/root/ray_results/train_func_2023-07-19_15-01-37/train_func_d620c_00000_0_2023-07-19_15-01-40 + """ + return get_session().trial_dir + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=1) +def get_world_size() -> int: + """Get the current world size (i.e. total number of workers) for this run. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + NUM_WORKERS = 2 + + def train_loop_per_worker(config): + assert train.get_context().get_world_size() == NUM_WORKERS + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TensorflowTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=NUM_WORKERS), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "world_size"): + raise RuntimeError( + "`get_world_size` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.world_size + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_world_rank() -> int: + """Get the world rank of this worker. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + def train_loop_per_worker(config): + if train.get_context().get_world_rank() == 0: + print("Worker 0") + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TensorflowTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... 
+ """ + session = get_session() + if not hasattr(session, "world_rank"): + raise RuntimeError( + "`get_world_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.world_rank + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_local_rank() -> int: + """Get the local rank of this worker (rank of the worker on its node). + + .. testcode:: + + import torch + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(config): + if torch.cuda.is_available(): + torch.cuda.set_device(train.get_context().get_local_rank()) + ... + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TorchTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2, use_gpu=True), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "local_rank"): + raise RuntimeError( + "`get_local_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.local_rank + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_local_world_size() -> int: + """Get the local world size of this node (i.e. number of workers on this node). + + Example: + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(): + print(train.get_context().get_local_world_size()) + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)]) + trainer = TorchTrainer(train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=1), + datasets={"train": train_dataset}) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "local_world_size"): + raise RuntimeError( + "`get_local_world_size` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.local_world_size + + +@PublicAPI(stability="beta") +@_warn_session_misuse(default_value=0) +def get_node_rank() -> int: + """Get the rank of this node. + + Example: + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(): + print(train.get_context().get_node_rank()) + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)]) + trainer = TorchTrainer(train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=1), + datasets={"train": train_dataset}) + trainer.fit() + + .. testoutput:: + :hide: + + ... + """ + session = get_session() + if not hasattr(session, "node_rank"): + raise RuntimeError( + "`get_node_rank` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.node_rank + + +@PublicAPI(stability="stable") +@_warn_session_misuse() +def get_dataset_shard( + dataset_name: Optional[str] = None, +) -> Optional["DataIterator"]: + """Returns the :class:`ray.data.DataIterator` shard for this worker. 
+ + Call :meth:`~ray.data.DataIterator.iter_torch_batches` or + :meth:`~ray.data.DataIterator.to_tf` on this shard to convert it to the + appropriate framework-specific data type. + + .. testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.torch import TorchTrainer + + def train_loop_per_worker(config): + ... + for epoch in range(2): + # Trainer will automatically handle sharding. + data_shard = train.get_dataset_shard("train") + for batch in data_shard.iter_torch_batches(): + ... + + train_dataset = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + trainer = TorchTrainer( + train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=2), + datasets={"train": train_dataset} + ) + trainer.fit() + + .. testoutput:: + :hide: + + ... + + Args: + dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then + specifies which dataset shard to return. + + Returns: + The ``DataIterator`` shard to use for this worker. + If no dataset is passed into Trainer, then return None. + """ + session = get_session() + if not hasattr(session, "get_dataset_shard"): + raise RuntimeError( + "`get_dataset_shard` can only be called for TrainSession! " + "Make sure you only use that in `train_loop_per_worker` function" + "that is passed into `DataParallelTrainer`." + ) + return session.get_dataset_shard(dataset_name) + + +@DeveloperAPI +@_warn_session_misuse() +def get_storage() -> StorageContext: + """Returns the :class:`~ray.train._internal.storage.StorageContext` storage + context which gives advanced access to the filesystem and paths + configured through `RunConfig`. + + NOTE: This is a developer API, and the `StorageContext` interface may change + without notice between minor versions. + """ + return get_session().storage diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..604a4fa3932930fa1d728ceb6e9bdd4619449bf6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__init__.py @@ -0,0 +1,14 @@ +from ray.train._internal.state.state_manager import TrainRunStateManager + +try: + import pydantic # noqa: F401 +except ImportError: + raise ModuleNotFoundError( + "pydantic isn't installed." 
+ "To install pydantic, please run 'pip install pydantic'" + ) + + +__all__ = [ + "TrainRunStateManager", +] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94b4bd7b07cc941dba6049ce0133eb70db7a52a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8a30db76ecf2075376e6b55685fbc0494ede08d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/schema.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..765fc521e4ef2d1dda66317e29560619ba0bb1e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_actor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b5cfda81aecb3eddd7678bceecac47adc66fac6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/__pycache__/state_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..48a4d7e4b2e0b2ae545c14dbe8020372c1d35f6d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/schema.py @@ -0,0 +1,158 @@ +from enum import Enum +from typing import List, Optional + +from ray._private.pydantic_compat import BaseModel, Field +from ray.dashboard.modules.job.pydantic_models import JobDetails +from ray.util.annotations import DeveloperAPI + +MAX_ERROR_STACK_TRACE_LENGTH = 50000 + + +@DeveloperAPI +class RunStatusEnum(str, Enum): + """Enumeration for the status of a train run.""" + + # (Deprecated) Replaced by RUNNING. 
+ # The train run has started + STARTED = "STARTED" + # The train run is running + RUNNING = "RUNNING" + # The train run was terminated as expected + FINISHED = "FINISHED" + # The train run was terminated early due to errors in the training function + ERRORED = "ERRORED" + # The train run was terminated early due to system errors or controller errors + ABORTED = "ABORTED" + + +@DeveloperAPI +class ActorStatusEnum(str, Enum): + DEAD = "DEAD" + ALIVE = "ALIVE" + + +@DeveloperAPI +class TrainWorkerInfo(BaseModel): + """Metadata of a Ray Train worker.""" + + actor_id: str = Field(description="Actor ID of the worker.") + world_rank: int = Field(description="World rank of the worker.") + local_rank: int = Field(description="Local rank of the worker.") + node_rank: int = Field(description="Node rank of the worker.") + node_id: str = Field(description="ID of the node that the worker is running on.") + node_ip: str = Field( + description="IP address of the node that the worker is running on." + ) + pid: int = Field(description="Process ID of the worker.") + gpu_ids: List[int] = Field( + description="A list of GPU ids allocated to that worker." + ) + status: Optional[ActorStatusEnum] = Field( + description="The status of the train worker actor. It can be ALIVE or DEAD." + ) + + +@DeveloperAPI +class MemoryInfo(BaseModel): + rss: int + vms: int + pfaults: Optional[int] + pageins: Optional[int] + + +@DeveloperAPI +class ProcessStats(BaseModel): + cpuPercent: float + # total memory, free memory, memory used ratio + mem: Optional[List[int]] + memoryInfo: MemoryInfo + + +class ProcessGPUUsage(BaseModel): + # This gpu usage stats from a process + pid: int + gpuMemoryUsage: int + + +@DeveloperAPI +class GPUStats(BaseModel): + uuid: str + index: int + name: str + utilizationGpu: Optional[float] + memoryUsed: float + memoryTotal: float + processInfo: ProcessGPUUsage + + +@DeveloperAPI +class TrainWorkerInfoWithDetails(TrainWorkerInfo): + """Metadata of a Ray Train worker.""" + + processStats: Optional[ProcessStats] = Field( + None, description="Process stats of the worker." + ) + gpus: List[GPUStats] = Field( + default_factory=list, + description=( + "GPU stats of the worker. " + "Only returns GPUs that are attached to the worker process." + ), + ) + + +@DeveloperAPI +class TrainDatasetInfo(BaseModel): + name: str = Field( + description="The key of the dataset dict specified in Ray Train Trainer." + ) + dataset_uuid: str = Field(description="The uuid of the dataset.") + dataset_name: Optional[str] = Field(description="The name of the dataset.") + + +@DeveloperAPI +class TrainRunInfo(BaseModel): + """Metadata for a Ray Train run and information about its workers.""" + + name: str = Field(description="The name of the Train run.") + id: str = Field(description="The unique identifier for each Train run.") + job_id: str = Field(description="The Ray Job ID.") + controller_actor_id: str = Field(description="Actor Id of the Train controller.") + workers: List[TrainWorkerInfo] = Field( + description="A List of Train workers sorted by global ranks." + ) + datasets: List[TrainDatasetInfo] = Field( + description="A List of dataset info for this Train run." + ) + run_status: RunStatusEnum = Field( + description="The current status of the train run. It can be one of the " + "following: RUNNING, FINISHED, ERRORED, or ABORTED." + ) + status_detail: str = Field( + description="Detailed information about the current run status, " + "such as error messages." 
+ ) + start_time_ms: int = Field( + description="The UNIX timestamp of the start time of this Train run." + ) + end_time_ms: Optional[int] = Field( + description="The UNIX timestamp of the end time of this Train run. " + "If null, the Train run has not ended yet." + ) + + +@DeveloperAPI +class TrainRunInfoWithDetails(TrainRunInfo): + """Metadata for a Ray Train run and information about its workers.""" + + workers: List[TrainWorkerInfoWithDetails] = Field( + description="A List of Train workers sorted by global ranks." + ) + job_details: Optional[JobDetails] = Field( + None, description="Details of the job that started this Train run." + ) + + +@DeveloperAPI +class TrainRunsResponse(BaseModel): + train_runs: List[TrainRunInfoWithDetails] diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..6404eb231ca193eac315b801587412313575650d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_actor.py @@ -0,0 +1,62 @@ +import logging +import threading +from typing import Dict, Optional + +import ray +from ray.actor import ActorHandle +from ray.train._internal.state.schema import TrainRunInfo + +logger = logging.getLogger(__name__) + + +@ray.remote(num_cpus=0) +class TrainStateActor: + def __init__(self): + self._run_infos: Dict[str, TrainRunInfo] = {} + + def register_train_run(self, run_info: TrainRunInfo) -> None: + # Register a new train run. + self._run_infos[run_info.id] = run_info + + def get_train_run(self, run_id: str) -> Optional[TrainRunInfo]: + # Retrieve a registered run with its id + return self._run_infos.get(run_id, None) + + def get_all_train_runs(self) -> Dict[str, TrainRunInfo]: + # Retrieve all registered train runs + return self._run_infos + + +TRAIN_STATE_ACTOR_NAME = "train_state_actor" +TRAIN_STATE_ACTOR_NAMESPACE = "_train_state_actor" + +_state_actor_lock: threading.RLock = threading.RLock() + + +def get_or_create_state_actor() -> ActorHandle: + """Get or create a `TrainStateActor` on the head node.""" + with _state_actor_lock: + state_actor = TrainStateActor.options( + name=TRAIN_STATE_ACTOR_NAME, + namespace=TRAIN_STATE_ACTOR_NAMESPACE, + get_if_exists=True, + lifetime="detached", + resources={"node:__internal_head__": 0.001}, + # Escape from the parent's placement group + scheduling_strategy="DEFAULT", + ).remote() + + # Ensure the state actor is ready + ray.get(state_actor.__ray_ready__.remote()) + return state_actor + + +def get_state_actor() -> Optional[ActorHandle]: + """Get the `TrainStateActor` if exists, otherwise return None.""" + try: + return ray.get_actor( + name=TRAIN_STATE_ACTOR_NAME, + namespace=TRAIN_STATE_ACTOR_NAMESPACE, + ) + except ValueError: + return None diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e7183714fc72f7a5b864d0f50fd18b725d0c2413 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/state/state_manager.py @@ -0,0 +1,126 @@ +import logging +import os +from collections import defaultdict +from typing import Any, Dict + +import ray +from ray.data import Dataset +from ray.train._internal.state.schema import ( + RunStatusEnum, + TrainDatasetInfo, + TrainRunInfo, + TrainWorkerInfo, +) +from ray.train._internal.utils import 
check_for_failure +from ray.train._internal.worker_group import WorkerGroup + +logger = logging.getLogger(__name__) + + +class TrainRunStateManager: + """A class that aggregates and reports train run info to TrainStateActor. + + This manager class is created on the train controller layer for each run. + """ + + def __init__(self, state_actor) -> None: + self.state_actor = state_actor + self.train_run_info_dict = defaultdict(dict) + + def register_train_run( + self, + run_id: str, + job_id: str, + run_name: str, + run_status: str, + controller_actor_id: str, + datasets: Dict[str, Dataset], + worker_group: WorkerGroup, + start_time_ms: float, + status_detail: str = "", + ) -> None: + """Collect Train Run Info and report to StateActor.""" + + if not self.state_actor: + logger.warning( + "Unable to register train run since `TrainStateActor` is not started." + ) + return + + def collect_train_worker_info(): + train_context = ray.train.get_context() + core_context = ray.runtime_context.get_runtime_context() + + return TrainWorkerInfo( + world_rank=train_context.get_world_rank(), + local_rank=train_context.get_local_rank(), + node_rank=train_context.get_node_rank(), + actor_id=core_context.get_actor_id(), + node_id=core_context.get_node_id(), + node_ip=ray.util.get_node_ip_address(), + gpu_ids=ray.get_gpu_ids(), + pid=os.getpid(), + ) + + futures = [ + worker_group.execute_single_async(index, collect_train_worker_info) + for index in range(len(worker_group)) + ] + success, exception = check_for_failure(futures) + + if not success: + logger.error( + "Failed to collect run information from the Ray Train " + f"workers:\n{exception}" + ) + return + + worker_info_list = ray.get(futures) + worker_info_list = sorted(worker_info_list, key=lambda info: info.world_rank) + + dataset_info_list = [ + TrainDatasetInfo( + name=ds_name, + dataset_name=ds._plan._dataset_name, + dataset_uuid=ds._plan._dataset_uuid, + ) + for ds_name, ds in datasets.items() + ] + + updates = dict( + id=run_id, + job_id=job_id, + name=run_name, + controller_actor_id=controller_actor_id, + workers=worker_info_list, + datasets=dataset_info_list, + start_time_ms=start_time_ms, + run_status=run_status, + status_detail=status_detail, + ) + + # Clear the cached info to avoid registering the same run twice + self.train_run_info_dict[run_id] = {} + self._update_train_run_info(run_id, updates) + + def end_train_run( + self, + run_id: str, + run_status: RunStatusEnum, + status_detail: str, + end_time_ms: int, + ): + """Update the train run status when the training is finished.""" + updates = dict( + run_status=run_status, + status_detail=status_detail, + end_time_ms=end_time_ms, + ) + self._update_train_run_info(run_id, updates) + + def _update_train_run_info(self, run_id: str, updates: Dict[str, Any]) -> None: + """Update specific fields of a registered TrainRunInfo instance.""" + if run_id in self.train_run_info_dict: + self.train_run_info_dict[run_id].update(updates) + train_run_info = TrainRunInfo(**self.train_run_info_dict[run_id]) + ray.get(self.state_actor.register_train_run.remote(train_run_info)) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..05970988862e371b2f080fb92e245a116a4596bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/storage.py @@ -0,0 +1,725 @@ +# Try import ray[train] core requirements (defined in setup.py) +# isort: off +try: 
+ import fsspec # noqa + from fsspec.implementations.local import LocalFileSystem + +except (ImportError, ModuleNotFoundError) as e: + raise RuntimeError( + "fsspec is a required dependency of Ray Train and Ray Tune. " + "Please install with: `pip install fsspec`" + ) from e + +try: + import pyarrow + import pyarrow.fs + +except (ImportError, ModuleNotFoundError) as e: + raise RuntimeError( + "pyarrow is a required dependency of Ray Train and Ray Tune. " + "Please install with: `pip install pyarrow`" + ) from e + +try: + # check if Arrow has S3 support + from pyarrow.fs import S3FileSystem +except ImportError: + S3FileSystem = None +# isort: on + +import fnmatch +import logging +import os +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union + +from ray.air._internal.filelock import TempFileLock +from ray.train._internal.syncer import SyncConfig, Syncer, _BackgroundSyncer +from ray.train.constants import _get_ray_train_session_dir +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.train._checkpoint import Checkpoint + + +logger = logging.getLogger(__name__) + + +_VALIDATE_STORAGE_MARKER_FILENAME = ".validate_storage_marker" + + +class _ExcludingLocalFilesystem(LocalFileSystem): + """LocalFileSystem wrapper to exclude files according to patterns. + + Args: + root_path: Root path to strip when matching with the exclude pattern. + Ex: root_path="/tmp/a/b/c", exclude=["*a*"], will exclude + /tmp/a/b/c/_a_.txt but not ALL of /tmp/a/*. + exclude: List of patterns that are applied to files returned by + ``self.find()``. If a file path matches this pattern, it will + be excluded. + + """ + + def __init__(self, root_path: Path, exclude: List[str], **kwargs): + super().__init__(**kwargs) + self._exclude = exclude + self._root_path = root_path + + @property + def fsid(self): + return "_excluding_local" + + def _should_exclude(self, path: str) -> bool: + """Return True if `path` (relative to `root_path`) matches any of the + `self._exclude` patterns.""" + path = Path(path) + relative_path = path.relative_to(self._root_path).as_posix() + match_candidates = [relative_path] + if path.is_dir(): + # Everything is in posix path format ('/') + match_candidates.append(relative_path + "/") + + for excl in self._exclude: + if any(fnmatch.fnmatch(candidate, excl) for candidate in match_candidates): + return True + return False + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + """Call parent find() and exclude from result.""" + paths = super().find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs + ) + if detail: + return { + path: out + for path, out in paths.items() + if not self._should_exclude(path) + } + else: + return [path for path in paths if not self._should_exclude(path)] + + +def _pyarrow_fs_copy_files( + source, destination, source_filesystem=None, destination_filesystem=None, **kwargs +): + if S3FileSystem and isinstance(destination_filesystem, pyarrow.fs.S3FileSystem): + # Workaround multi-threading issue with pyarrow. Note that use_threads=True + # is safe for download, just not for uploads, see: + # https://github.com/apache/arrow/issues/32372 + kwargs.setdefault("use_threads", False) + + # Use a large chunk size to speed up large checkpoint transfers. 
+ kwargs.setdefault("chunk_size", 64 * 1024 * 1024) + + return pyarrow.fs.copy_files( + source, + destination, + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + **kwargs, + ) + + +# TODO(justinvyu): Add unit tests for all these utils. + + +def _delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str): + is_dir = _is_directory(fs, fs_path) + + try: + if is_dir: + fs.delete_dir(fs_path) + else: + fs.delete_file(fs_path) + except Exception: + logger.exception(f"Caught exception when deleting path at ({fs}, {fs_path}):") + + +def _download_from_fs_path( + fs: pyarrow.fs.FileSystem, + fs_path: str, + local_path: str, + filelock: bool = True, +): + """Downloads a directory or file from (fs, fs_path) to a local path. + + If fs_path points to a directory: + - The full directory contents are downloaded directly into `local_path`, + rather than to a subdirectory of `local_path`. + + If fs_path points to a file: + - The file is downloaded to `local_path`, which is expected to be a file path. + + If the download fails, the `local_path` contents are + cleaned up before raising, if the directory did not previously exist. + + NOTE: This method creates `local_path`'s parent directories if they do not + already exist. If the download fails, this does NOT clean up all the parent + directories that were created. + + Args: + fs: The filesystem to download from. + fs_path: The filesystem path (either a directory or a file) to download. + local_path: The local path to download to. + filelock: Whether to require a file lock before downloading, useful for + multiple downloads to the same directory that may be happening in parallel. + + Raises: + FileNotFoundError: if (fs, fs_path) doesn't exist. + """ + + _local_path = Path(local_path).resolve() + exists_before = _local_path.exists() + if _is_directory(fs=fs, fs_path=fs_path): + _local_path.mkdir(parents=True, exist_ok=True) + else: + _local_path.parent.mkdir(parents=True, exist_ok=True) + + try: + if filelock: + with TempFileLock(f"{os.path.normpath(local_path)}.lock"): + _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs) + else: + _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs) + except Exception as e: + # Clean up the directory if downloading was unsuccessful + if not exists_before: + shutil.rmtree(local_path, ignore_errors=True) + raise e + + +def _upload_to_fs_path( + local_path: str, + fs: pyarrow.fs.FileSystem, + fs_path: str, + exclude: Optional[List[str]] = None, +) -> None: + """Uploads a local directory or file to (fs, fs_path). + + NOTE: This will create all necessary parent directories at the destination. + + Args: + local_path: The local path to upload. + fs: The filesystem to upload to. + fs_path: The filesystem path where the dir/file will be uploaded to. + exclude: A list of filename matches to exclude from upload. This includes + all files under subdirectories as well. + This pattern will match with the relative paths of all files under + `local_path`. + Ex: ["*.png"] to exclude all .png images. 
+ """ + + if not exclude: + # TODO(justinvyu): uploading a single file doesn't work + # (since we always create a directory at fs_path) + _create_directory(fs=fs, fs_path=fs_path) + _pyarrow_fs_copy_files(local_path, fs_path, destination_filesystem=fs) + return + + _upload_to_uri_with_exclude_fsspec( + local_path=local_path, fs=fs, fs_path=fs_path, exclude=exclude + ) + + +def _upload_to_uri_with_exclude_fsspec( + local_path: str, fs: "pyarrow.fs", fs_path: str, exclude: Optional[List[str]] +) -> None: + local_fs = _ExcludingLocalFilesystem(root_path=local_path, exclude=exclude) + handler = pyarrow.fs.FSSpecHandler(local_fs) + source_fs = pyarrow.fs.PyFileSystem(handler) + + _create_directory(fs=fs, fs_path=fs_path) + _pyarrow_fs_copy_files( + local_path, fs_path, source_filesystem=source_fs, destination_filesystem=fs + ) + + +def _list_at_fs_path( + fs: pyarrow.fs.FileSystem, + fs_path: str, + file_filter: Optional[Callable[[pyarrow.fs.FileInfo], bool]] = None, +) -> List[str]: + """Returns the list of filenames at (fs, fs_path), similar to os.listdir. + + If the path doesn't exist, returns an empty list. + """ + if file_filter is None: + file_filter = lambda x: True # noqa: E731 + + selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False) + return [ + os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/")) + for file_info in fs.get_file_info(selector) + if file_filter(file_info) + ] + + +def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool: + """Returns True if (fs, fs_path) exists.""" + + valid = fs.get_file_info(fs_path) + return valid.type != pyarrow.fs.FileType.NotFound + + +def _is_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> bool: + """Checks if (fs, fs_path) is a directory or a file. + + Raises: + FileNotFoundError: if (fs, fs_path) doesn't exist. + """ + + file_info = fs.get_file_info(fs_path) + if file_info.type == pyarrow.fs.FileType.NotFound: + raise FileNotFoundError(f"Path not found: ({fs}, {fs_path})") + + return not file_info.is_file + + +def _create_directory(fs: pyarrow.fs.FileSystem, fs_path: str) -> None: + """Create directory at (fs, fs_path). + + Some external filesystems require directories to already exist, or at least + the `netloc` to be created (e.g. PyArrows ``mock://`` filesystem). + + Generally this should be done before and outside of Ray applications. This + utility is thus primarily used in testing, e.g. of ``mock://` URIs. + """ + try: + fs.create_dir(fs_path) + except Exception: + logger.exception( + f"Caught exception when creating directory at ({fs}, {fs_path}):" + ) + + +def get_fs_and_path( + storage_path: Union[str, os.PathLike], + storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, +) -> Tuple[pyarrow.fs.FileSystem, str]: + """Returns the fs and path from a storage path and an optional custom fs. + + Args: + storage_path: A storage path or URI. (ex: s3://bucket/path or /tmp/ray_results) + storage_filesystem: A custom filesystem to use. If not provided, + this will be auto-resolved by pyarrow. If provided, the storage_path + is assumed to be prefix-stripped already, and must be a valid path + on the filesystem. 
+ """ + storage_path = str(storage_path) + + if storage_filesystem: + return storage_filesystem, storage_path + + return pyarrow.fs.FileSystem.from_uri(storage_path) + + +class _FilesystemSyncer(_BackgroundSyncer): + """Syncer between local filesystem and a `storage_filesystem`.""" + + def __init__(self, storage_filesystem: Optional["pyarrow.fs.FileSystem"], **kwargs): + self.storage_filesystem = storage_filesystem + super().__init__(**kwargs) + + def _sync_up_command( + self, local_path: str, uri: str, exclude: Optional[List] = None + ) -> Tuple[Callable, Dict]: + # TODO(justinvyu): Defer this cleanup up as part of the + # external-facing Syncer deprecation. + fs_path = uri + return ( + _upload_to_fs_path, + dict( + local_path=local_path, + fs=self.storage_filesystem, + fs_path=fs_path, + exclude=exclude, + ), + ) + + def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]: + fs_path = uri + return ( + _download_from_fs_path, + dict( + fs=self.storage_filesystem, + fs_path=fs_path, + local_path=local_path, + ), + ) + + def _delete_command(self, uri: str) -> Tuple[Callable, Dict]: + fs_path = uri + return _delete_fs_path, dict(fs=self.storage_filesystem, fs_path=fs_path) + + +@DeveloperAPI +class StorageContext: + """Shared context that holds the source of truth for all paths and + storage utilities, passed along from the driver to workers. + + This object defines a few types of paths: + 1. *_fs_path: A path on the `storage_filesystem`. This is a regular path + which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and + can be joined with `Path(...).as_posix()`. + 2. *_driver_staging_path: The temporary staging directory on the local filesystem + where driver artifacts are saved to before persisting them to storage. + 3. trial_working_directory: The local filesystem path that the remote + actors' working directories are moved to by default. + This is separated from the driver staging path so that driver syncing + does not implicitly upload the trial working directory, for trials on the + driver node. + + Example with storage_path="mock:///bucket/path?param=1": + + >>> import ray + >>> from ray.train._internal.storage import StorageContext + >>> import os + >>> _ = ray.init() + >>> storage = StorageContext( + ... storage_path="mock://netloc/bucket/path?param=1", + ... experiment_dir_name="exp_name", + ... ) + >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS + >> storage.experiment_fs_path + 'bucket/path/exp_name' + >>> storage.experiment_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts' + >>> storage.trial_dir_name = "trial_dir" + >>> storage.trial_fs_path + 'bucket/path/exp_name/trial_dir' + >>> storage.trial_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts/trial_dir' + >>> storage.trial_working_directory # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/working_dirs/trial_dir' + >>> storage.current_checkpoint_index = 1 + >>> storage.checkpoint_fs_path + 'bucket/path/exp_name/trial_dir/checkpoint_000001' + >>> ray.shutdown() + + Example with storage_path="/tmp/ray_results": + + >>> from ray.train._internal.storage import StorageContext + >>> storage = StorageContext( + ... storage_path="/tmp/ray_results", + ... experiment_dir_name="exp_name", + ... 
) + >>> storage.storage_fs_path + '/tmp/ray_results' + >>> storage.experiment_fs_path + '/tmp/ray_results/exp_name' + >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS + " + ) + + def _create_validation_file(self): + """On the creation of a storage context, create a validation file at the + storage path to verify that the storage path can be written to. + This validation file is also used to check whether the storage path is + accessible by all nodes in the cluster.""" + valid_file = Path( + self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME + ).as_posix() + self.storage_filesystem.create_dir(self.experiment_fs_path) + with self.storage_filesystem.open_output_stream(valid_file): + pass + + def _check_validation_file(self): + """Checks that the validation file exists at the storage path.""" + valid_file = Path( + self.experiment_fs_path, _VALIDATE_STORAGE_MARKER_FILENAME + ).as_posix() + if not _exists_at_fs_path(fs=self.storage_filesystem, fs_path=valid_file): + raise RuntimeError( + f"Unable to set up cluster storage with the following settings:\n{self}" + "\nCheck that all nodes in the cluster have read/write access " + "to the configured storage path. `RunConfig(storage_path)` should be " + "set to a cloud storage URI or a shared filesystem path accessible " + "by all nodes in your cluster ('s3://bucket' or '/mnt/nfs'). " + "A local path on the head node is not accessible by worker nodes. " + "See: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html" # noqa: E501 + ) + + def _update_checkpoint_index(self, metrics: Dict): + # Per default, increase by 1. This can be overwritten to customize checkpoint + # directories. + self.current_checkpoint_index += 1 + + def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint": + """Persists a given checkpoint to the current checkpoint path on the filesystem. + + "Current" is defined by the `current_checkpoint_index` attribute of the + storage context. + + This method copies the checkpoint files to the storage location. + It's up to the user to delete the original checkpoint files if desired. + + For example, the original directory is typically a local temp directory. + + Args: + checkpoint: The checkpoint to persist to (fs, checkpoint_fs_path). + + Returns: + Checkpoint: A Checkpoint pointing to the persisted checkpoint location. + """ + # TODO(justinvyu): Fix this cyclical import. + from ray.train._checkpoint import Checkpoint + + logger.debug( + "Copying checkpoint files to storage path:\n" + "({source_fs}, {source}) -> ({dest_fs}, {destination})".format( + source=checkpoint.path, + destination=self.checkpoint_fs_path, + source_fs=checkpoint.filesystem, + dest_fs=self.storage_filesystem, + ) + ) + + # Raise an error if the storage path is not accessible when + # attempting to upload a checkpoint from a remote worker. + # Ex: If storage_path is a local path, then a validation marker + # will only exist on the head node but not the worker nodes. 
+ self._check_validation_file() + + self.storage_filesystem.create_dir(self.checkpoint_fs_path) + _pyarrow_fs_copy_files( + source=checkpoint.path, + destination=self.checkpoint_fs_path, + source_filesystem=checkpoint.filesystem, + destination_filesystem=self.storage_filesystem, + ) + + persisted_checkpoint = Checkpoint( + filesystem=self.storage_filesystem, + path=self.checkpoint_fs_path, + ) + logger.info(f"Checkpoint successfully created at: {persisted_checkpoint}") + return persisted_checkpoint + + def persist_artifacts(self, force: bool = False) -> None: + """Persists all artifacts within `trial_local_dir` to storage. + + This method possibly launches a background task to sync the trial dir, + depending on the `sync_period` + `sync_artifacts_on_checkpoint` + settings of `SyncConfig`. + + `(local_fs, trial_working_dir) -> (storage_filesystem, trial_fs_path)` + + Args: + force: If True, wait for a previous sync to finish, launch a new one, + and wait for that one to finish. By the end of a `force=True` call, the + latest version of the trial artifacts will be persisted. + """ + if not self.sync_config.sync_artifacts: + return + + # Skip if there are no artifacts to sync + is_empty = not any(os.scandir(self.trial_working_directory)) + if is_empty: + return + + if force: + self.syncer.wait() + self.syncer.sync_up( + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path + ) + self.syncer.wait() + else: + self.syncer.sync_up_if_needed( + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path + ) + + @property + def experiment_fs_path(self) -> str: + """The path on the `storage_filesystem` to the experiment directory. + + NOTE: This does not have a URI prefix anymore, since it has been stripped + by pyarrow.fs.FileSystem.from_uri already. The URI scheme information is + kept in `storage_filesystem` instead. + """ + return Path(self.storage_fs_path, self.experiment_dir_name).as_posix() + + def _get_session_path(self) -> str: + """The Ray Train/Tune session local directory used to stage files + before persisting to the storage filesystem.""" + return Path( + _get_ray_train_session_dir(), self._timestamp, self.experiment_dir_name + ).as_posix() + + @property + def experiment_driver_staging_path(self) -> str: + """The local filesystem path of the experiment directory on the driver node. + + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts` + + This should be used as the temporary staging location for files *on the driver* + before syncing them to `experiment_fs_path`. + For example, the search algorithm should dump its state to this directory. + See `trial_driver_staging_path` for writing trial-specific artifacts. + + The directory is synced to + `{storage_path}/{experiment_dir_name}` periodically. + See `_ExperimentCheckpointManager.checkpoint` for where that happens. + """ + return Path(self._get_session_path(), "driver_artifacts").as_posix() + + @property + def trial_fs_path(self) -> str: + """The trial directory path on the `storage_filesystem`. + + Raises a ValueError if `trial_dir_name` is not set beforehand. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Should not access `trial_fs_path` without setting `trial_dir_name`" + ) + return Path(self.experiment_fs_path, self.trial_dir_name).as_posix() + + @property + def trial_driver_staging_path(self) -> str: + """The local filesystem path of the trial directory on the driver. 
+ + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts/` + + This should be used as the temporary location for files on the driver + before persisting them to `trial_fs_path`. + + For example, callbacks (e.g., JsonLoggerCallback) should write trial-specific + logfiles within this directory. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Should not access `trial_driver_staging_path` " + "without setting `trial_dir_name`" + ) + return Path(self.experiment_driver_staging_path, self.trial_dir_name).as_posix() + + @property + def trial_working_directory(self) -> str: + """The local filesystem path to trial working directory. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /working_dirs/` + + Ray Train/Tune moves the remote actor's working directory to this path + by default, unless disabled by `RAY_CHDIR_TO_TRIAL_DIR` environment variable. + + Writing files to this directory allows users to persist training artifacts + if `SyncConfig(sync_artifacts=True)` is set. + """ + if self.trial_dir_name is None: + raise RuntimeError( + "Cannot access `trial_working_directory` without " + "setting `trial_dir_name`" + ) + return Path( + self._get_session_path(), "working_dirs", self.trial_dir_name + ).as_posix() + + @property + def checkpoint_fs_path(self) -> str: + """The current checkpoint directory path on the `storage_filesystem`. + + "Current" refers to the checkpoint that is currently being created/persisted. + The user of this class is responsible for setting the `current_checkpoint_index` + (e.g., incrementing when needed). + """ + return Path(self.trial_fs_path, self.checkpoint_dir_name).as_posix() + + @property + def checkpoint_dir_name(self) -> str: + """The current checkpoint directory name, based on the checkpoint index.""" + return StorageContext._make_checkpoint_dir_name(self.current_checkpoint_index) + + @staticmethod + def get_experiment_dir_name(run_obj: Union[str, Callable, Type]) -> str: + from ray.tune.experiment import Experiment + from ray.tune.utils import date_str + + run_identifier = Experiment.get_trainable_name(run_obj) + + if bool(int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0))): + dir_name = run_identifier + else: + dir_name = "{}_{}".format(run_identifier, date_str()) + return dir_name + + @staticmethod + def _make_checkpoint_dir_name(index: int): + """Get the name of the checkpoint directory, given an index.""" + return f"checkpoint_{index:06d}" diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py new file mode 100644 index 0000000000000000000000000000000000000000..4413e92452950508f9cbedc1816defd3ae7a97a3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/syncer.py @@ -0,0 +1,490 @@ +import abc +import logging +import threading +import time +import traceback +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray.train.constants import _DEPRECATED_VALUE +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.widgets import Template + +logger = logging.getLogger(__name__) + +# Syncing period for syncing checkpoints between nodes or to cloud. 
+DEFAULT_SYNC_PERIOD = 300 + +# Default sync timeout after which syncing processes are aborted +DEFAULT_SYNC_TIMEOUT = 1800 + + +@PublicAPI(stability="stable") +@dataclass +class SyncConfig: + """Configuration object for Train/Tune file syncing to `RunConfig(storage_path)`. + + In Ray Train/Tune, here is where syncing (mainly uploading) happens: + + The experiment driver (on the head node) syncs the experiment directory to storage + (which includes experiment state such as searcher state, the list of trials + and their statuses, and trial metadata). + + It's also possible to sync artifacts from the trial directory to storage + by setting `sync_artifacts=True`. + For a Ray Tune run with many trials, each trial will upload its trial directory + to storage, which includes arbitrary files that you dumped during the run. + For a Ray Train run doing distributed training, each remote worker will similarly + upload its trial directory to storage. + + See :ref:`persistent-storage-guide` for more details and examples. + + Args: + sync_period: Minimum time in seconds to wait between two sync operations. + A smaller ``sync_period`` will have the data in storage updated more often + but introduces more syncing overhead. Defaults to 5 minutes. + sync_timeout: Maximum time in seconds to wait for a sync process + to finish running. A sync operation will run for at most this long + before raising a `TimeoutError`. Defaults to 30 minutes. + sync_artifacts: [Beta] Whether or not to sync artifacts that are saved to the + trial directory (accessed via `train.get_context().get_trial_dir()`) + to the persistent storage configured via `train.RunConfig(storage_path)`. + The trial or remote worker will try to launch an artifact syncing + operation every time `train.report` happens, subject to `sync_period` + and `sync_artifacts_on_checkpoint`. + Defaults to False -- no artifacts are persisted by default. + sync_artifacts_on_checkpoint: If True, trial/worker artifacts are + forcefully synced on every reported checkpoint. + This only has an effect if `sync_artifacts` is True. + Defaults to True. + """ + + sync_period: int = DEFAULT_SYNC_PERIOD + sync_timeout: int = DEFAULT_SYNC_TIMEOUT + sync_artifacts: bool = False + sync_artifacts_on_checkpoint: bool = True + upload_dir: Optional[str] = _DEPRECATED_VALUE + syncer: Optional[Union[str, "Syncer"]] = _DEPRECATED_VALUE + sync_on_checkpoint: bool = _DEPRECATED_VALUE + + # TODO(justinvyu): [Deprecated] Remove in 2.11. + def _deprecation_warning(self, attr_name: str, extra_msg: str): + if getattr(self, attr_name) != _DEPRECATED_VALUE: + raise DeprecationWarning( + f"`SyncConfig({attr_name})` is a deprecated configuration " + "Please remove it from your `SyncConfig`. " + f"{extra_msg}" + ) + + def __post_init__(self): + for attr_name, extra_msg in [ + ( + "upload_dir", + "\nPlease specify `ray.train.RunConfig(storage_path)` instead.", + ), + ( + "syncer", + "\nPlease implement custom syncing logic with a custom " + "`pyarrow.fs.FileSystem` instead, and pass it into " + "`ray.train.RunConfig(storage_filesystem)`. 
" + "See here: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html#custom-storage", # noqa: E501 + ), + ("sync_on_checkpoint", ""), + ]: + self._deprecation_warning(attr_name, extra_msg) + + def _repr_html_(self) -> str: + """Generate an HTML representation of the SyncConfig.""" + return Template("scrollableTable.html.j2").render( + table=tabulate( + { + "Setting": ["Sync period", "Sync timeout"], + "Value": [self.sync_period, self.sync_timeout], + }, + tablefmt="html", + showindex=False, + headers="keys", + ), + max_height="none", + ) + + +class _BackgroundProcess: + def __init__(self, fn: Callable): + self._fn = fn + self._process = None + self._result = {} + self._start_time = float("-inf") + + @property + def is_running(self): + return self._process and self._process.is_alive() + + @property + def start_time(self): + return self._start_time + + def start(self, *args, **kwargs): + if self.is_running: + return False + + self._result = {} + + def entrypoint(): + try: + result = self._fn(*args, **kwargs) + except Exception as e: + self._result["exception"] = e + return + + self._result["result"] = result + + self._process = threading.Thread(target=entrypoint) + self._process.daemon = True + self._process.start() + self._start_time = time.time() + + def wait(self, timeout: Optional[float] = None) -> Any: + """Waits for the background process to finish running. Waits until the + background process has run for at least `timeout` seconds, counting from + the time when the process was started.""" + if not self._process: + return None + + time_remaining = None + if timeout: + elapsed = time.time() - self.start_time + time_remaining = max(timeout - elapsed, 0) + + self._process.join(timeout=time_remaining) + + if self._process.is_alive(): + self._process = None + raise TimeoutError( + f"{getattr(self._fn, '__name__', str(self._fn))} did not finish " + f"running within the timeout of {timeout} seconds." + ) + + self._process = None + + exception = self._result.get("exception") + if exception: + raise exception + + result = self._result.get("result") + + self._result = {} + return result + + +@DeveloperAPI +class Syncer(abc.ABC): + """Syncer class for synchronizing data between Ray nodes and remote (cloud) storage. + + This class handles data transfer for two cases: + + 1. Synchronizing data such as experiment state snapshots from the driver to + cloud storage. + 2. Synchronizing data such as trial checkpoints from remote trainables to + cloud storage. + + Synchronizing tasks are usually asynchronous and can be awaited using ``wait()``. + The base class implements a ``wait_or_retry()`` API that will retry a failed + sync command. + + The base class also exposes an API to only kick off syncs every ``sync_period`` + seconds. + + Args: + sync_period: The minimum time in seconds between sync operations, as + used by ``sync_up/down_if_needed``. + sync_timeout: The maximum time to wait for a sync process to finish before + issuing a new sync operation. Ex: should be used by ``wait`` if launching + asynchronous sync tasks. + """ + + def __init__( + self, + sync_period: float = DEFAULT_SYNC_PERIOD, + sync_timeout: float = DEFAULT_SYNC_TIMEOUT, + ): + self.sync_period = sync_period + self.sync_timeout = sync_timeout + self.last_sync_up_time = float("-inf") + self.last_sync_down_time = float("-inf") + + @abc.abstractmethod + def sync_up( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + """Synchronize local directory to remote directory. 
+ + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + local_dir: Local directory to sync from. + remote_dir: Remote directory to sync up to. This is an URI + (``protocol://remote/path``). + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + @abc.abstractmethod + def sync_down( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ) -> bool: + """Synchronize remote directory to local directory. + + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + remote_dir: Remote directory to sync down from. This is an URI + (``protocol://remote/path``). + local_dir: Local directory to sync to. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + @abc.abstractmethod + def delete(self, remote_dir: str) -> bool: + """Delete directory on remote storage. + + This function can spawn an asynchronous process that can be awaited in + ``wait()``. + + Args: + remote_dir: Remote directory to delete. This is an URI + (``protocol://remote/path``). + + Returns: + True if sync process has been spawned, False otherwise. + + """ + raise NotImplementedError + + def retry(self): + """Retry the last sync up, sync down, or delete command. + + You should implement this method if you spawn asynchronous syncing + processes. + """ + pass + + def wait(self, timeout: Optional[float] = None): + """Wait for asynchronous sync command to finish. + + You should implement this method if you spawn asynchronous syncing + processes. This method should timeout after the asynchronous command + has run for `sync_timeout` seconds and raise a `TimeoutError`. + """ + pass + + def sync_up_if_needed( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + """Syncs up if time since last sync up is greater than sync_period. + + Args: + local_dir: Local directory to sync from. + remote_dir: Remote directory to sync up to. This is an URI + (``protocol://remote/path``). + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + """ + now = time.time() + if now - self.last_sync_up_time >= self.sync_period: + result = self.sync_up( + local_dir=local_dir, remote_dir=remote_dir, exclude=exclude + ) + self.last_sync_up_time = now + return result + + def sync_down_if_needed( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ): + """Syncs down if time since last sync down is greater than sync_period. + + Args: + remote_dir: Remote directory to sync down from. This is an URI + (``protocol://remote/path``). + local_dir: Local directory to sync to. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. 
+ """ + now = time.time() + if now - self.last_sync_down_time >= self.sync_period: + result = self.sync_down( + remote_dir=remote_dir, local_dir=local_dir, exclude=exclude + ) + self.last_sync_down_time = now + return result + + def wait_or_retry(self, max_retries: int = 2, backoff_s: int = 5): + assert max_retries > 0 + last_error_traceback = None + for i in range(max_retries + 1): + try: + self.wait() + except Exception as e: + attempts_remaining = max_retries - i + + # If we're out of retries, then save the full traceback of the last + # error and show it when raising an exception. + if attempts_remaining == 0: + last_error_traceback = traceback.format_exc() + break + + logger.error( + f"The latest sync operation failed with the following error: " + f"{repr(e)}\n" + f"Retrying {attempts_remaining} more time(s) after sleeping " + f"for {backoff_s} seconds..." + ) + time.sleep(backoff_s) + self.retry() + continue + # Succeeded! + return + raise RuntimeError( + f"Failed sync even after {max_retries} retries. " + f"The latest sync failed with the following error:\n{last_error_traceback}" + ) + + def reset(self): + self.last_sync_up_time = float("-inf") + self.last_sync_down_time = float("-inf") + + def close(self): + pass + + def _repr_html_(self) -> str: + return + + +class _BackgroundSyncer(Syncer): + """Syncer using a background process for asynchronous file transfer.""" + + def __init__( + self, + sync_period: float = DEFAULT_SYNC_PERIOD, + sync_timeout: float = DEFAULT_SYNC_TIMEOUT, + ): + super(_BackgroundSyncer, self).__init__( + sync_period=sync_period, sync_timeout=sync_timeout + ) + self._sync_process = None + self._current_cmd = None + + def _should_continue_existing_sync(self): + """Returns whether a previous sync is still running within the timeout.""" + return ( + self._sync_process + and self._sync_process.is_running + and time.time() - self._sync_process.start_time < self.sync_timeout + ) + + def _launch_sync_process(self, sync_command: Tuple[Callable, Dict]): + """Waits for the previous sync process to finish, + then launches a new process that runs the given command.""" + if self._sync_process: + try: + self.wait() + except Exception: + logger.warning( + f"Last sync command failed with the following error:\n" + f"{traceback.format_exc()}" + ) + + self._current_cmd = sync_command + self.retry() + + def sync_up( + self, local_dir: str, remote_dir: str, exclude: Optional[List] = None + ) -> bool: + if self._should_continue_existing_sync(): + logger.debug( + f"Last sync still in progress, " + f"skipping sync up of {local_dir} to {remote_dir}" + ) + return False + + sync_up_cmd = self._sync_up_command( + local_path=local_dir, uri=remote_dir, exclude=exclude + ) + self._launch_sync_process(sync_up_cmd) + + return True + + def _sync_up_command( + self, local_path: str, uri: str, exclude: Optional[List] = None + ) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def sync_down( + self, remote_dir: str, local_dir: str, exclude: Optional[List] = None + ) -> bool: + if self._should_continue_existing_sync(): + logger.warning( + f"Last sync still in progress, " + f"skipping sync down of {remote_dir} to {local_dir}" + ) + return False + + sync_down_cmd = self._sync_down_command(uri=remote_dir, local_path=local_dir) + self._launch_sync_process(sync_down_cmd) + + return True + + def _sync_down_command(self, uri: str, local_path: str) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def delete(self, remote_dir: str) -> bool: + if self._should_continue_existing_sync(): + 
logger.warning( + f"Last sync still in progress, skipping deletion of {remote_dir}" + ) + return False + + delete_cmd = self._delete_command(uri=remote_dir) + self._launch_sync_process(delete_cmd) + + return True + + def _delete_command(self, uri: str) -> Tuple[Callable, Dict]: + raise NotImplementedError + + def wait(self, timeout: Optional[float] = None): + if self._sync_process: + try: + self._sync_process.wait(timeout=timeout or self.sync_timeout) + except Exception as e: + raise e + finally: + # Regardless of whether the sync process succeeded within the timeout, + # clear the sync process so a new one can be created. + self._sync_process = None + + def retry(self): + if not self._current_cmd: + raise RuntimeError("No sync command set, cannot retry.") + cmd, kwargs = self._current_cmd + self._sync_process = _BackgroundProcess(cmd) + self._sync_process.start(**kwargs) + + def __getstate__(self): + state = self.__dict__.copy() + state["_sync_process"] = None + return state diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9946789ef18a33d506e4be4d811a1f3dd901d8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/utils.py @@ -0,0 +1,239 @@ +import abc +import functools +import inspect +import logging +import os +from pathlib import Path +from typing import ( + Any, + Callable, + ContextManager, + Dict, + List, + Optional, + Tuple, + TypeVar, + Union, +) + +import ray +from ray.actor import ActorHandle +from ray.air._internal.util import ( + StartTraceback, + StartTracebackWithWorkerRank, + find_free_port, +) +from ray.exceptions import RayActorError +from ray.types import ObjectRef + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +def check_for_failure( + remote_values: List[ObjectRef], +) -> Tuple[bool, Optional[Exception]]: + """Check for actor failure when retrieving the remote values. + + Args: + remote_values: List of object references from Ray actor methods. + + Returns: + A tuple of (bool, Exception). The bool is + True if evaluating all object references is successful, False otherwise. + """ + unfinished = remote_values.copy() + + while len(unfinished) > 0: + finished, unfinished = ray.wait(unfinished) + + # If a failure occurs the ObjectRef will be marked as finished. + # Calling ray.get will expose the failure as a RayActorError. + for object_ref in finished: + # Everything in finished has either failed or completed + # successfully. + try: + ray.get(object_ref) + except RayActorError as exc: + failed_actor_rank = remote_values.index(object_ref) + logger.info(f"Worker {failed_actor_rank} has failed.") + return False, exc + except Exception as exc: + # Other (e.g. training) errors should be directly raised + failed_worker_rank = remote_values.index(object_ref) + raise StartTracebackWithWorkerRank( + worker_rank=failed_worker_rank + ) from exc + + return True, None + + +def get_address_and_port() -> Tuple[str, int]: + """Returns the IP address and a free port on this node.""" + addr = ray.util.get_node_ip_address() + port = find_free_port() + + return addr, port + + +def construct_path(path: Path, parent_path: Path) -> Path: + """Constructs a path relative to a parent. + + Args: + path: A relative or absolute path. + parent_path: A relative path or absolute path. + + Returns: An absolute path. 
+ """ + if path.expanduser().is_absolute(): + return path.expanduser().resolve() + else: + return parent_path.joinpath(path).expanduser().resolve() + + +def update_env_vars(env_vars: Dict[str, Any]): + """Updates the environment variables on this worker process. + + Args: + env_vars: Environment variables to set. + """ + sanitized = {k: str(v) for k, v in env_vars.items()} + os.environ.update(sanitized) + + +def count_required_parameters(fn: Callable) -> int: + """Counts the number of required parameters of a function. + + NOTE: *args counts as 1 required parameter. + + Examples + -------- + + >>> def fn(a, b, /, c, *args, d=1, e=2, **kwargs): + ... pass + >>> count_required_parameters(fn) + 4 + + >>> fn = lambda: 1 + >>> count_required_parameters(fn) + 0 + + >>> def fn(config, a, b=1, c=2): + ... pass + >>> from functools import partial + >>> count_required_parameters(partial(fn, a=0)) + 1 + """ + params = inspect.signature(fn).parameters.values() + + positional_param_kinds = { + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.VAR_POSITIONAL, + } + return len( + [ + p + for p in params + if p.default == inspect.Parameter.empty and p.kind in positional_param_kinds + ] + ) + + +def construct_train_func( + train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]], + config: Optional[Dict[str, Any]], + train_func_context: ContextManager, + fn_arg_name: Optional[str] = "train_func", + discard_returns: bool = False, +) -> Callable[[], T]: + """Validates and constructs the training function to execute. + Args: + train_func: The training function to execute. + This can either take in no arguments or a ``config`` dict. + config (Optional[Dict]): Configurations to pass into + ``train_func``. If None then an empty Dict will be created. + train_func_context: Context manager for user's `train_func`, which executes + backend-specific logic before and after the training function. + fn_arg_name (Optional[str]): The name of training function to use for error + messages. + discard_returns: Whether to discard any returns from train_func or not. + Returns: + A valid training function. + Raises: + ValueError: if the input ``train_func`` is invalid. + """ + num_required_params = count_required_parameters(train_func) + + if discard_returns: + # Discard any returns from the function so that + # BackendExecutor doesn't try to deserialize them. + # Those returns are inaccesible with AIR anyway. + @functools.wraps(train_func) + def discard_return_wrapper(*args, **kwargs): + try: + train_func(*args, **kwargs) + except Exception as e: + raise StartTraceback from e + + wrapped_train_func = discard_return_wrapper + else: + wrapped_train_func = train_func + + if num_required_params > 1: + err_msg = ( + f"{fn_arg_name} should take in 0 or 1 required arguments, but it accepts " + f"{num_required_params} required arguments instead." 
+ ) + raise ValueError(err_msg) + elif num_required_params == 1: + config = {} if config is None else config + + @functools.wraps(wrapped_train_func) + def train_fn(): + try: + with train_func_context(): + return wrapped_train_func(config) + except Exception as e: + raise StartTraceback from e + + else: # num_params == 0 + + @functools.wraps(wrapped_train_func) + def train_fn(): + try: + with train_func_context(): + return wrapped_train_func() + except Exception as e: + raise StartTraceback from e + + return train_fn + + +class Singleton(abc.ABCMeta): + """Singleton Abstract Base Class + + https://stackoverflow.com/questions/33364070/implementing + -singleton-as-metaclass-but-for-abstract-classes + """ + + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class ActorWrapper: + """Wraps an actor to provide same API as using the base class directly.""" + + def __init__(self, actor: ActorHandle): + self.actor = actor + + def __getattr__(self, item): + # The below will fail if trying to access an attribute (not a method) from the + # actor. + actor_method = getattr(self.actor, item) + return lambda *args, **kwargs: ray.get(actor_method.remote(*args, **kwargs)) diff --git a/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py b/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py new file mode 100644 index 0000000000000000000000000000000000000000..72da84e3c1058796af6373945d21c4a2c00fe7b4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_internal/worker_group.py @@ -0,0 +1,426 @@ +import logging +import os +import socket +from collections import defaultdict +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union + +import ray +from ray.actor import ActorHandle +from ray.air._internal.util import exception_cause, skip_exceptions +from ray.types import ObjectRef +from ray.util.placement_group import PlacementGroup + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +class RayTrainWorker: + """A class to execute arbitrary functions. Does not hold any state.""" + + def __execute(self, func: Callable[..., T], *args, **kwargs) -> T: + """Executes the input function and returns the output. + + Args: + func: The function to execute. + args, kwargs: The arguments to pass into func. + """ + try: + return func(*args, **kwargs) + except Exception as e: + skipped = skip_exceptions(e) + raise skipped from exception_cause(skipped) + + +@dataclass +class WorkerMetadata: + """Metadata for each worker/actor. + + This information is expected to stay the same throughout the lifetime of + actor. + + Args: + node_id: ID of the node this worker is on. + node_ip: IP address of the node this worker is on. + hostname: Hostname that this worker is on. + resource_ids: Map of accelerator resources + ("GPU", "neuron_cores", ..) to their IDs. + pid: Process ID of this worker. 
+ """ + + node_id: str + node_ip: str + hostname: str + resource_ids: Dict[str, List[str]] + pid: int + + +@dataclass +class Worker: + """Class representing a Worker.""" + + actor: ActorHandle + metadata: WorkerMetadata + + +def create_executable_class(executable_cls: Optional[Type] = None) -> Type: + """Create the executable class to use as the Ray actors.""" + if not executable_cls: + return RayTrainWorker + elif issubclass(executable_cls, RayTrainWorker): + return executable_cls + else: + + class _WrappedExecutable(executable_cls, RayTrainWorker): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + return _WrappedExecutable + + +def construct_metadata() -> WorkerMetadata: + """Creates metadata for this worker. + + This function is expected to be run on the actor. + """ + node_id = ray.get_runtime_context().get_node_id() + node_ip = ray.util.get_node_ip_address() + hostname = socket.gethostname() + accelerator_ids = ray.get_runtime_context().get_accelerator_ids() + pid = os.getpid() + + return WorkerMetadata( + node_id=node_id, + node_ip=node_ip, + hostname=hostname, + resource_ids=accelerator_ids, + pid=pid, + ) + + +class WorkerGroup: + """Group of Ray Actors that can execute arbitrary functions. + + ``WorkerGroup`` launches Ray actors according to the given + specification. It can then execute arbitrary Python functions in each of + these workers. + + If not enough resources are available to launch the actors, the Ray + cluster will automatically scale up if autoscaling is enabled. + + Args: + num_workers: The number of workers (Ray actors) to launch. + Defaults to 1. + resources_per_worker (Optional[Dict[str, float]]): + Dictionary specifying the resources that will be + requested for each worker. Defaults to {"CPU": 1}. + actor_cls (Optional[Type]): If specified use this class as the + remote actors. + remote_cls_args, remote_cls_kwargs: If ``remote_cls`` is provided, + these args will be used for the worker initialization. + placement_group (PlacementGroup|str): The placement group that workers + should be created in. Defaults to "default" which will inherit the + parent placement group (if child tasks should be captured). + + + Example: + + .. code_block:: python + + worker_group = WorkerGroup(num_workers=2) + output = worker_group.execute(lambda: 1) + assert len(output) == 2 + assert all(o == 1 for o in output) + """ + + def __init__( + self, + num_workers: int = 1, + resources_per_worker: Optional[Dict[str, float]] = None, + actor_cls: Type = None, + actor_cls_args: Optional[Tuple] = None, + actor_cls_kwargs: Optional[Dict] = None, + placement_group: Union[PlacementGroup, str] = "default", + ): + if resources_per_worker is None: + resources_per_worker = {"CPU": 1} + else: + resources_per_worker = resources_per_worker.copy() + + if num_workers <= 0: + raise ValueError( + "The provided `num_workers` must be greater " + f"than 0. Received num_workers={num_workers} " + f"instead." + ) + + if any(v < 0 for v in resources_per_worker.values()): + raise ValueError( + "The number of resources per worker must not be negative. " + f"Received resources_per_worker={resources_per_worker}." + ) + + if (actor_cls_args or actor_cls_kwargs) and not actor_cls: + raise ValueError( + "`actor_cls_args` or `actor_class_kwargs` are " + "passed in but no `actor_cls` is passed in." 
+ ) + + self.num_workers = num_workers + self.num_cpus_per_worker = resources_per_worker.pop("CPU", 0) + self.num_gpus_per_worker = resources_per_worker.pop("GPU", 0) + self.memory_per_worker = resources_per_worker.pop("memory", 0) + self.workers = [] + self._base_cls = create_executable_class(actor_cls) + assert issubclass(self._base_cls, RayTrainWorker) + + self._actor_cls_args = actor_cls_args or [] + self._actor_cls_kwargs = actor_cls_kwargs or {} + + self._placement_group = placement_group + + # TODO(matt): Validate resources. Fast-fail if it is impossible to + # handle the request, rather than hang indefinitely. + self._remote_cls = ray.remote( + num_cpus=self.num_cpus_per_worker, + num_gpus=self.num_gpus_per_worker, + memory=self.memory_per_worker, + resources=resources_per_worker, + )(self._base_cls) + self.start() + + def start(self): + """Starts all the workers in this worker group.""" + if self.workers and len(self.workers) > 0: + raise RuntimeError( + "The workers have already been started. " + "Please call `shutdown` first if you want to " + "restart them." + ) + + logger.debug(f"Starting {self.num_workers} workers.") + self.add_workers(self.num_workers) + logger.debug(f"{len(self.workers)} workers have successfully started.") + + def shutdown(self, patience_s: float = 5): + """Shutdown all the workers in this worker group. + + Args: + patience_s: Attempt a graceful shutdown + of the workers for this many seconds. Fallback to force kill + if graceful shutdown is not complete after this time. If + this is less than or equal to 0, immediately force kill all + workers. + """ + logger.debug(f"Shutting down {len(self.workers)} workers.") + if patience_s <= 0: + for worker in self.workers: + ray.kill(worker.actor) + else: + done_refs = [w.actor.__ray_terminate__.remote() for w in self.workers] + # Wait for actors to die gracefully. + done, not_done = ray.wait(done_refs, timeout=patience_s) + if not_done: + logger.debug("Graceful termination failed. Falling back to force kill.") + # If all actors are not able to die gracefully, then kill them. + for worker in self.workers: + ray.kill(worker.actor) + + logger.debug("Shutdown successful.") + self.workers = [] + + def execute_async(self, func: Callable[..., T], *args, **kwargs) -> List[ObjectRef]: + """Execute ``func`` on each worker and return the futures. + + Args: + func: A function to call on each worker. + args, kwargs: Passed directly into func. + + Returns: + (List[ObjectRef]) A list of ``ObjectRef`` representing the + output of ``func`` from each worker. The order is the same + as ``self.workers``. + + """ + if len(self.workers) <= 0: + raise RuntimeError( + "There are no active workers. This worker " + "group has most likely been shut down. Please" + "create a new WorkerGroup or restart this one." + ) + + return [ + w.actor._RayTrainWorker__execute.options( + name=f"_RayTrainWorker__execute.{func.__name__}" + ).remote(func, *args, **kwargs) + for w in self.workers + ] + + def execute(self, func: Callable[..., T], *args, **kwargs) -> List[T]: + """Execute ``func`` on each worker and return the outputs of ``func``. + + Args: + func: A function to call on each worker. + args, kwargs: Passed directly into func. + + Returns: + (List[T]) A list containing the output of ``func`` from each + worker. The order is the same as ``self.workers``. 
+ + """ + return ray.get(self.execute_async(func, *args, **kwargs)) + + def execute_single_async( + self, worker_index: int, func: Callable[..., T], *args, **kwargs + ) -> ObjectRef: + """Execute ``func`` on worker ``worker_index`` and return futures. + + Args: + worker_index: The index to execute func on. + func: A function to call on the first worker. + args, kwargs: Passed directly into func. + + Returns: + (ObjectRef) An ObjectRef representing the output of func. + + """ + if worker_index >= len(self.workers): + raise ValueError( + f"The provided worker_index {worker_index} is " + f"not valid for {self.num_workers} workers." + ) + return ( + self.workers[worker_index] + .actor._RayTrainWorker__execute.options( + name=f"_RayTrainWorker__execute.{func.__name__}" + ) + .remote(func, *args, **kwargs) + ) + + def execute_single( + self, worker_index: int, func: Callable[..., T], *args, **kwargs + ) -> T: + """Execute ``func`` on worker with index ``worker_index``. + + Args: + worker_index: The index to execute func on. + func: A function to call on the first worker. + args, kwargs: Passed directly into func. + + Returns: + (T) The output of func. + + """ + + return ray.get(self.execute_single_async(worker_index, func, *args, **kwargs)) + + def remove_workers(self, worker_indexes: List[int]): + """Removes the workers with the specified indexes. + + The removed workers will go out of scope and their actor processes + will be terminated. + + Args: + worker_indexes (List[int]): The indexes of the workers to remove. + """ + new_workers = [] + for i in range(len(self.workers)): + if i not in worker_indexes: + new_workers.append(self.workers[i]) + self.workers = new_workers + + def add_workers(self, num_workers: int): + """Adds ``num_workers`` to this WorkerGroup. + + Note: Adding workers when the cluster/placement group is at capacity + may lead to undefined hanging behavior. If you are attempting to + replace existing workers in the WorkerGroup, remove_workers() should + be called first. + + Args: + num_workers: The number of workers to add. + """ + new_actors = [] + new_actor_metadata = [] + for _ in range(num_workers): + actor = self._remote_cls.options( + placement_group=self._placement_group + ).remote(*self._actor_cls_args, **self._actor_cls_kwargs) + new_actors.append(actor) + new_actor_metadata.append( + actor._RayTrainWorker__execute.options( + name="_RayTrainWorker__execute.construct_metadata" + ).remote(construct_metadata) + ) + + # Get metadata from all actors. + metadata = ray.get(new_actor_metadata) + + for i in range(len(new_actors)): + self.workers.append(Worker(actor=new_actors[i], metadata=metadata[i])) + + def sort_workers_by_node_id_and_gpu_id(self, _first_node_id: Optional[str] = None): + """Reorder the workers by their node id and the lowest GPU id. + + This is useful for collocating workers on the same node. + + Example: + Given workers with the following attributes: + worker_0: node_id=1, gpu_ids=[1] + worker_1: node_id=0, gpu_ids=[0] + worker_2: node_id=1, gpu_ids=[0] + worker_3: node_id=0, gpu_ids=[1] + + The function will perform the following steps: + 1. Group by node ID: + node_id=0: worker_1, worker_3 + node_id=1: worker_0, worker_2 + + 2. Sort each group by GPU ID: + node_id=0: worker_1 (gpu_id=0), worker_3 (gpu_id=1) + node_id=1: worker_2 (gpu_id=0), worker_0 (gpu_id=1) + + Resulting in the order: [worker_1, worker_3, worker_2, worker_0] + + Args: + _first_node_id: The first ID to group by. 
+ Set this to the node ID of the trainer coordinator to ensure that the + rank 0 worker is on the same node, allowing additional resources to + be specified for rank 0 workers via + `ScalingConfig(trainer_resources=)`. + """ + node_id_to_workers = defaultdict(list) + + if _first_node_id is not None: + node_id_to_workers[_first_node_id] = [] + + for worker in self.workers: + node_id_to_workers[worker.metadata.node_id].append(worker) + + # Sort workers on the same node by the lowest GPU id + # More details: https://github.com/ray-project/ray/issues/40803 + def get_lowest_gpu_id(worker) -> int: + gpu_ids = worker.metadata.resource_ids.get("GPU", []) + # If there are no GPU IDs, return 0 as a default + if not gpu_ids: + return 0 + + # Attempt to convert GPU IDs to integers and find the minimum ID. + # Fallback to return the minimum string-based ID + try: + return min(int(gpu_id) for gpu_id in gpu_ids) + except ValueError: + return min(gpu_ids) + + for node_id in node_id_to_workers: + node_id_to_workers[node_id].sort(key=get_lowest_gpu_id) + + sorted_workers = [] + for workers in node_id_to_workers.values(): + sorted_workers.extend(workers) + + self.workers = sorted_workers + + def __len__(self): + return len(self.workers) diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73f9e7ab585a1d577f589e9c795a5560d830169a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/__init__.py @@ -0,0 +1,22 @@ +# isort: off +try: + import horovod # noqa: F401 +except ModuleNotFoundError: + raise ModuleNotFoundError( + "Horovod isn't installed. To install Horovod with PyTorch support, run 'pip " + "install 'horovod[pytorch]''. To install Horovod with TensorFlow support, " + "run 'pip install 'horovod[tensorflow]''." + ) +# isort: on + +from ray.train.horovod.config import HorovodConfig +from ray.train.horovod.horovod_trainer import HorovodTrainer +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.horovod.horovod_trainer import HorovodTrainer # noqa: F811 + +__all__ = ["HorovodConfig", "HorovodTrainer"] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
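For reference, the two-step ordering that `WorkerGroup.sort_workers_by_node_id_and_gpu_id` documents above (group by node ID, then sort each group by its lowest GPU ID) can be reproduced with a short standalone sketch. The worker records below are made-up stand-ins for `Worker` objects, mirroring `WorkerMetadata.node_id` and `resource_ids["GPU"]`; only the sorting logic itself is taken from the module.

    from collections import defaultdict

    # Made-up stand-ins for WorkerGroup.workers (same values as the docstring example).
    workers = [
        {"name": "worker_0", "node_id": "1", "gpu_ids": ["1"]},
        {"name": "worker_1", "node_id": "0", "gpu_ids": ["0"]},
        {"name": "worker_2", "node_id": "1", "gpu_ids": ["0"]},
        {"name": "worker_3", "node_id": "0", "gpu_ids": ["1"]},
    ]

    # Step 1: group by node ID, preserving the order in which nodes are first seen.
    by_node = defaultdict(list)
    for w in workers:
        by_node[w["node_id"]].append(w)

    # Step 2: sort each node's group by the lowest GPU ID
    # (workers without GPUs default to 0, as in the module).
    def lowest_gpu_id(w):
        return min((int(g) for g in w["gpu_ids"]), default=0)

    sorted_workers = [
        w for group in by_node.values() for w in sorted(group, key=lowest_gpu_id)
    ]
    print([w["name"] for w in sorted_workers])
    # ['worker_2', 'worker_0', 'worker_1', 'worker_3']

Within each node the worker with the lowest GPU ID comes first; the overall node order simply follows the order in which nodes are first encountered in `self.workers`, except that `_first_node_id`, when given, is placed first so that the coordinator's node (and hence the rank 0 worker) leads.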
diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c55fe6bb561e328104dba704d1067e22d44b0ca0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2f1c19cab0bcd7da37aec40fc2e37851fa2217c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcbf7251d4b2b71855ad473b0b95f782530ce783 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/horovod/__pycache__/horovod_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py new file mode 100644 index 0000000000000000000000000000000000000000..acd56091d3a4d7bc505bad007c0d394d103269ae --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/config.py @@ -0,0 +1,159 @@ +import os +from dataclasses import dataclass +from typing import Optional, Set + +from horovod.ray.runner import Coordinator +from horovod.ray.utils import detect_nics, nics_to_env_var +from horovod.runner.common.util import secret, timeout + +import ray +from ray.train._internal.utils import update_env_vars +from ray.train._internal.worker_group import Worker, WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + + +@PublicAPI(stability="beta") +@dataclass +class HorovodConfig(BackendConfig): + """Configurations for Horovod setup. + + See https://github.com/horovod/horovod/blob/master/horovod/runner/common/util/settings.py # noqa: E501 + + Args: + nics (Optional[Set[str]): Network interfaces that can be used for + communication. + verbose: Horovod logging verbosity. + key (Optional[str]): Secret used for communication between workers. + ssh_port (Optional[int]): Port for SSH server running on worker nodes. + ssh_identity_file (Optional[str]): Path to the identity file to + ssh into different hosts on the cluster. + ssh_str (Optional[str]): CAUTION WHEN USING THIS. Private key + file contents. Writes the private key to ssh_identity_file. + timeout_s: Timeout parameter for Gloo rendezvous. + placement_group_timeout_s: Timeout parameter for Ray + Placement Group creation. Currently unused. + """ + + nics: Optional[Set[str]] = None + verbose: int = 1 + key: Optional[str] = None + ssh_port: Optional[int] = None + ssh_identity_file: Optional[str] = None + ssh_str: Optional[str] = None + timeout_s: int = 300 + placement_group_timeout_s: int = 100 + + @property + def start_timeout(self): + return timeout.Timeout( + self.timeout_s, + message="Timed out waiting for {activity}. Please " + "check connectivity between servers. 
You " + "may need to increase the --start-timeout " + "parameter if you have too many servers.", + ) + + def __post_init__(self): + if self.ssh_str and not os.path.exists(self.ssh_identity_file): + with open(self.ssh_identity_file, "w") as f: + os.chmod(self.ssh_identity_file, 0o600) + f.write(self.ssh_str) + + if self.key is None: + self.key = secret.make_secret_key() + + @property + def backend_cls(self): + return _HorovodBackend + + +class _HorovodBackend(Backend): + share_cuda_visible_devices: bool = True + + def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig): + # TODO(matt): Implement placement group strategies in BackendExecutor. + + # Initialize workers with Horovod environment variables + setup_futures = [] + for rank in range(len(worker_group)): + worker_node_id = worker_group.workers[rank].metadata.node_id + setup_futures.append( + worker_group.execute_single_async( + rank, + _init_env_vars, + rank, + len(worker_group), + worker_node_id, + ) + ) + ray.get(setup_futures) + + # Use Horovod Ray Coordinator + # backend_config as settings + self.coordinator = Coordinator(backend_config) + + # Get all the hostnames of all workers + node_ids = [w.metadata.node_id for w in worker_group.workers] + hostnames = [w.metadata.hostname for w in worker_group.workers] + # Register each hostname to the coordinator. assumes the hostname + # ordering is the same. + for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)): + self.coordinator.register(hostname, node_id, rank) + all_info = self.coordinator.finalize_registration() + + setup_futures = [] + for rank, local_cross_env_var in all_info.items(): + setup_futures.append( + worker_group.execute_single_async( + rank, update_env_vars, local_cross_env_var + ) + ) + ray.get(setup_futures) + + coordinator_envs = self.coordinator.establish_rendezvous() + + # Get one worker from each host/node. + node_worker_indexes = [node_ids.index(node_id) for node_id in set(node_ids)] + node_workers = [ + _HorovodWorkerWrapper(worker_group.workers[worker_index]) + for worker_index in node_worker_indexes + ] + assert len(node_workers) == len(self.coordinator.hostnames) + + nics = detect_nics( + backend_config, + all_host_names=list(self.coordinator.hostnames), + node_workers=node_workers, + ) + coordinator_envs.update(nics_to_env_var(nics)) + + worker_group.execute(update_env_vars, coordinator_envs) + + +def _init_env_vars(world_rank: int, world_size: int, node_id: str): + """Initialize Horovod environment variables.""" + os.environ["HOROVOD_HOSTNAME"] = node_id + os.environ["HOROVOD_RANK"] = str(world_rank) + os.environ["HOROVOD_SIZE"] = str(world_size) + + +# TODO(tgaddair): temporary workaround for Horovod's worker discovery logic, +# which requires passing in an extra parameter as part of the RayExecutor +# API. This will be removed in the future as we migrate more of the +# RayExecutor utils into Ray Train. 
+# See: https://github.com/horovod/horovod/blob/v0.23.0/horovod/ray/driver_service.py#L9 # noqa: E501 +@dataclass +class _HorovodWorkerWrapper: + w: Worker + + @property + def execute(self): + w = self.w + + class ExecuteHandle: + def remote(self, func, *args, **kwargs): + _ = None + return w.actor._RayTrainWorker__execute.remote(func, _, *args, **kwargs) + + return ExecuteHandle() diff --git a/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..27d97572d7935814d3a18747d2a2b22d5c09c179 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/horovod/horovod_trainer.py @@ -0,0 +1,202 @@ +from typing import Any, Callable, Dict, Optional, Union + +from ray.air.config import RunConfig, ScalingConfig +from ray.train import Checkpoint, DataConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.horovod.config import HorovodConfig +from ray.train.trainer import GenDataset +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +class HorovodTrainer(DataParallelTrainer): + """A Trainer for data parallel Horovod training. + + This Trainer runs the function ``train_loop_per_worker`` on multiple Ray + Actors. These actors already have the necessary Horovod setup already + configured for distributed Horovod training. + + The ``train_loop_per_worker`` function is expected to take in either 0 or 1 + arguments: + + .. testcode:: + + def train_loop_per_worker(): + ... + + .. testcode:: + + def train_loop_per_worker(config: Dict): + ... + + If ``train_loop_per_worker`` accepts an argument, then + ``train_loop_config`` will be passed in as the argument. This is useful if you + want to tune the values in ``train_loop_config`` as hyperparameters. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + ``train_loop_per_worker``. All the other datasets will not be split and + ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + + Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray Train loop methods `. + + .. testcode:: + + from ray import train + + def train_loop_per_worker(): + # Report intermediate results for callbacks or logging and + # checkpoint data. + train.report(...) + + # Returns dict of last saved checkpoint. + train.get_checkpoint() + + # Returns the Dataset shard for the given key. + train.get_dataset_shard("my_dataset") + + # Returns the total number of workers executing training. + train.get_context().get_world_size() + + # Returns the rank of this worker. + train.get_context().get_world_rank() + + # Returns the rank of the worker on the current node. + train.get_context().get_local_rank() + + Any returns from the ``train_loop_per_worker`` will be discarded and not + used or persisted anywhere. + + You could use ``TensorflowPredictor`` or ``TorchPredictor`` in conjunction with + HorovodTrainer. You must save the model under the "model" kwarg in the + ``Checkpoint`` passed to ``train.report()``, so that it can be used by + corresponding predictors. + + Example: + + + .. 
testcode:: + :skipif: True + + import os + import tempfile + + import ray + import horovod.torch as hvd + import torch + import torch.nn as nn + + from ray import train + import ray.train.torch # Need this to use `train.torch.get_device()` + from ray.train import Checkpoint, ScalingConfig + from ray.train.horovod import HorovodTrainer + + # If using GPUs, set this to True. + use_gpu = False + + input_size = 1 + layer_size = 15 + output_size = 1 + num_epochs = 3 + + class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.layer1 = nn.Linear(input_size, layer_size) + self.relu = nn.ReLU() + self.layer2 = nn.Linear(layer_size, output_size) + def forward(self, input): + return self.layer2(self.relu(self.layer1(input))) + + def train_loop_per_worker(): + hvd.init() + dataset_shard = train.get_dataset_shard("train") + model = NeuralNetwork() + device = train.torch.get_device() + model.to(device) + loss_fn = nn.MSELoss() + lr_scaler = 1 + optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * lr_scaler) + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer( + optimizer, + named_parameters=model.named_parameters(), + op=hvd.Average, + ) + for epoch in range(num_epochs): + model.train() + for batch in dataset_shard.iter_torch_batches( + batch_size=32, dtypes=torch.float + ): + inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"] + outputs = model(inputs) + loss = loss_fn(outputs, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + print(f"epoch: {epoch}, loss: {loss.item()}") + + # Save a model checkpoint at the end of each epoch + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + ckpt_path = os.path.join(temp_checkpoint_dir, "model.pt") + torch.save(model.state_dict(), ckpt_path) + train.report( + {"loss": loss.item(), "epoch": epoch}, + checkpoint=Checkpoint.from_directory(temp_checkpoint_dir), + ) + + train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu) + trainer = HorovodTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=scaling_config, + datasets={"train": train_dataset}, + ) + result = trainer.fit() + + Args: + train_loop_per_worker: The training function to execute. + This can either take in no arguments or a ``config`` dict. + train_loop_config: Configurations to pass into + ``train_loop_per_worker`` if it accepts an argument. + horovod_config: Configuration for setting up the Horovod backend. + If set to None, use the default configuration. This replaces the + ``backend_config`` arg of ``DataParallelTrainer``. + scaling_config: Configuration for how to scale data parallel training. + dataset_config: Configuration for dataset ingest. + run_config: Configuration for the execution of the training run. + datasets: Any Datasets to use for training. Use + the key "train" to denote which dataset is the training + dataset. + resume_from_checkpoint: A checkpoint to resume training from. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + horovod_config: Optional[HorovodConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[DataConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super().__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=horovod_config or HorovodConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e413a1030837512e86386b76f1c12ce21b2a92 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightning/__init__.py @@ -0,0 +1,39 @@ +# isort: off +try: + import lightning # noqa: F401 +except ModuleNotFoundError: + try: + import pytorch_lightning # noqa: F401 + except ModuleNotFoundError: + raise ModuleNotFoundError( + "PyTorch Lightning isn't installed. To install PyTorch Lightning, " + "please run 'pip install lightning'" + ) +# isort: on + +from ray.train.lightning._lightning_utils import ( + RayDDPStrategy, + RayDeepSpeedStrategy, + RayFSDPStrategy, + RayLightningEnvironment, + RayTrainReportCallback, + prepare_trainer, +) +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.lightning.lightning_utils import ( # noqa: F811 + RayTrainReportCallback, + ) + +__all__ = [ + "prepare_trainer", + "RayDDPStrategy", + "RayFSDPStrategy", + "RayDeepSpeedStrategy", + "RayLightningEnvironment", + "RayTrainReportCallback", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc0c6e0201df800a354fdf824cda9ba7b94ea821 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2397b6a8179b4c3cc4c01a080a333f7e4ab0ddd7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightning/__pycache__/_lightning_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py b/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba42fe12f4ba36db70620a7aeba310a6b0ccc66f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py @@ -0,0 +1,295 @@ +import logging +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any, Dict + +import torch +from packaging.version import Version + +import ray +from ray import train +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.train import Checkpoint +from ray.util import PublicAPI + + +def import_lightning(): # noqa: F402 + try: + import lightning.pytorch as pl + except ModuleNotFoundError: + import pytorch_lightning as pl + return pl + + +pl = import_lightning() + +_LIGHTNING_GREATER_EQUAL_2_0 = Version(pl.__version__) >= Version("2.0.0") +_LIGHTNING_LESS_THAN_2_1 = Version(pl.__version__) < Version("2.1.0") +_TORCH_GREATER_EQUAL_1_12 = Version(torch.__version__) >= Version("1.12.0") +_TORCH_FSDP_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.distributed.is_available() + +try: + from lightning.pytorch.plugins.environments import LightningEnvironment +except ModuleNotFoundError: + from pytorch_lightning.plugins.environments import LightningEnvironment + +if _LIGHTNING_GREATER_EQUAL_2_0: + FSDPStrategy = pl.strategies.FSDPStrategy +else: + FSDPStrategy = pl.strategies.DDPFullyShardedStrategy + +if _TORCH_FSDP_AVAILABLE: + from torch.distributed.fsdp import ( + FullStateDictConfig, + FullyShardedDataParallel, + StateDictType, + ) + + +logger = logging.getLogger(__name__) + +LIGHTNING_REPORT_STAGE_KEY = "_report_on" + + +@PublicAPI(stability="beta") +class RayDDPStrategy(pl.strategies.DDPStrategy): + """Subclass of DDPStrategy to ensure compatibility with Ray orchestration. + + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DDPStrategy.html + + Note that `process_group_backend`, `timeout`, and `start_method` are disabled here, + please specify these arguments in :class:`~ray.train.torch.TorchConfig` instead. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDDPSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + +@PublicAPI(stability="beta") +class RayFSDPStrategy(FSDPStrategy): # noqa: F821 + """Subclass of FSDPStrategy to ensure compatibility with Ray orchestration. + + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.FSDPStrategy.html + + .. note:: + It is recommended to upgrade `lightning>=2.1` or above when using FSDP + with Lightning, since Lightning starts to natively support `state_dict_type`, + `sharding_strategy`, `auto_wrap_policy` and other FSDP configurations from 2.1. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYFSDPSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + def lightning_module_state_dict(self) -> Dict[str, Any]: + """Gathers the full state dict to rank 0 on CPU. + + FSDP checkpointing is broken in Lightning 2.0.x. This subclass patches the + behavior to perform a full state dict checkpointing, gathering the checkpoint + shards on rank 0 CPU. Upgrade to `lightning>=2.1` to do sharded state dict + checkpointing. + + See the note in the class docstring for more details. + """ + + assert self.model is not None, "Failed to get the state dict for a None model!" + + if ( + _TORCH_FSDP_AVAILABLE + and _LIGHTNING_GREATER_EQUAL_2_0 + and _LIGHTNING_LESS_THAN_2_1 + ): + with FullyShardedDataParallel.state_dict_type( + module=self.model, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=FullStateDictConfig( + offload_to_cpu=True, rank0_only=True + ), + ): + state_dict = self.model.state_dict() + + ckpt_state_dict = {} + prefix_len = len("_forward_module.") + for k, v in state_dict.items(): + if k.startswith("_forward_module."): + non_prefixed_key = k[prefix_len:] + ckpt_state_dict[non_prefixed_key] = v + else: + ckpt_state_dict[k] = v + return ckpt_state_dict + else: + # Otherwise Lightning uses Fairscale FSDP, no need to unshard by ourself. + return super().lightning_module_state_dict() + + +@PublicAPI(stability="beta") +class RayDeepSpeedStrategy(pl.strategies.DeepSpeedStrategy): + """Subclass of DeepSpeedStrategy to ensure compatibility with Ray orchestration. 
+ + For a full list of initialization arguments, please refer to: + https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DeepSpeedStrategy.html + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDEEPSPEEDSTRATEGY, "1") + + @property + def root_device(self) -> torch.device: + return ray.train.torch.get_device() + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + return dict( + num_replicas=self.world_size, + rank=self.global_rank, + ) + + +@PublicAPI(stability="beta") +class RayLightningEnvironment(LightningEnvironment): # noqa: F821 + """Setup Lightning DDP training environment for Ray cluster.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT, "1") + + def world_size(self) -> int: + return train.get_context().get_world_size() + + def global_rank(self) -> int: + return train.get_context().get_world_rank() + + def local_rank(self) -> int: + return train.get_context().get_local_rank() + + def node_rank(self) -> int: + return train.get_context().get_node_rank() + + def set_world_size(self, size: int) -> None: + # Disable it since `world_size()` directly returns data from Train context. + pass + + def set_global_rank(self, rank: int) -> None: + # Disable it since `global_rank()` directly returns data from Train. + pass + + def teardown(self): + pass + + +@PublicAPI(stability="beta") +def prepare_trainer(trainer: pl.Trainer) -> pl.Trainer: + """Prepare the PyTorch Lightning Trainer for distributed execution.""" + + # Check strategy class + valid_strategy_class = [RayDDPStrategy, RayFSDPStrategy, RayDeepSpeedStrategy] + + if not any(isinstance(trainer.strategy, cls) for cls in valid_strategy_class): + raise RuntimeError( + f"Invalid strategy class: {type(trainer.strategy)}. To use " + "PyTorch Lightning with Ray, the strategy object should be one of " + f"{[cls.__name__ for cls in valid_strategy_class]} class " + "or its subclass." + ) + + # Check cluster environment + cluster_environment = getattr(trainer.strategy, "cluster_environment", None) + if cluster_environment and not isinstance( + cluster_environment, RayLightningEnvironment + ): + raise RuntimeError( + "Invalid cluster environment plugin. The expected class is" + "`ray.train.lightning.RayLightningEnvironment` " + f"but got {type(cluster_environment)}!" + ) + + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_PREPARE_TRAINER, "1") + return trainer + + +@PublicAPI(stability="beta") +class RayTrainReportCallback(pl.callbacks.Callback): + """A simple callback that reports checkpoints to Ray on train epoch end. + + This callback is a subclass of `lightning.pytorch.callbacks.Callback + `_. + + It fetches the latest `trainer.callback_metrics` and reports together with + the checkpoint on each training epoch end. + + Checkpoints will be saved in the following structure:: + + checkpoint_00000*/ Ray Train Checkpoint + └─ checkpoint.ckpt PyTorch Lightning Checkpoint + + For customized reporting and checkpointing logic, implement your own + `lightning.pytorch.callbacks.Callback` following this user + guide: :ref:`Saving and Loading Checkpoints `. 
+ """ + + CHECKPOINT_NAME = "checkpoint.ckpt" + + def __init__(self) -> None: + super().__init__() + self.trial_name = train.get_context().get_trial_name() + self.local_rank = train.get_context().get_local_rank() + self.tmpdir_prefix = Path(tempfile.gettempdir(), self.trial_name).as_posix() + if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0: + shutil.rmtree(self.tmpdir_prefix) + + record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK, "1") + + def on_train_epoch_end(self, trainer, pl_module) -> None: + # Creates a checkpoint dir with fixed name + tmpdir = Path(self.tmpdir_prefix, str(trainer.current_epoch)).as_posix() + os.makedirs(tmpdir, exist_ok=True) + + # Fetch metrics + metrics = trainer.callback_metrics + metrics = {k: v.item() for k, v in metrics.items()} + + # (Optional) Add customized metrics + metrics["epoch"] = trainer.current_epoch + metrics["step"] = trainer.global_step + + # Save checkpoint to local + ckpt_path = Path(tmpdir, self.CHECKPOINT_NAME).as_posix() + trainer.save_checkpoint(ckpt_path, weights_only=False) + + # Report to train session + checkpoint = Checkpoint.from_directory(tmpdir) + train.report(metrics=metrics, checkpoint=checkpoint) + + # Add a barrier to ensure all workers finished reporting here + trainer.strategy.barrier() + + if self.local_rank == 0: + shutil.rmtree(tmpdir) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..db989336afd15a2d2147d45f9b6685f3e823598c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/__init__.py @@ -0,0 +1,56 @@ +# isort: off +try: + import torch # noqa: F401 +except ModuleNotFoundError: + raise ModuleNotFoundError( + "PyTorch isn't installed. To install PyTorch, run 'pip install torch'" + ) +# isort: on + +from ray.train.torch.config import TorchConfig +from ray.train.torch.torch_checkpoint import TorchCheckpoint +from ray.train.torch.torch_detection_predictor import TorchDetectionPredictor +from ray.train.torch.torch_predictor import TorchPredictor +from ray.train.torch.torch_trainer import TorchTrainer +from ray.train.torch.train_loop_utils import ( + accelerate, + backward, + enable_reproducibility, + get_device, + get_devices, + prepare_data_loader, + prepare_model, + prepare_optimizer, +) +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.torch.torch_trainer import TorchTrainer # noqa: F811 + from ray.train.v2.torch.train_loop_utils import ( # noqa: F811 + accelerate, + backward, + enable_reproducibility, + prepare_data_loader, + prepare_model, + prepare_optimizer, + ) + + +__all__ = [ + "TorchTrainer", + "TorchCheckpoint", + "TorchConfig", + "accelerate", + "get_device", + "get_devices", + "prepare_model", + "prepare_optimizer", + "prepare_data_loader", + "backward", + "enable_reproducibility", + "TorchPredictor", + "TorchDetectionPredictor", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d891e2dffd497d7d9fd0d8b6b0bfc71b01cd16e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd78d8f68b7ca8289824aee66cf8b41ed5add595 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffda8831838ce5559b2c84d1e1a323f47294b203 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac54b56fb43d2015fd69d8ee1c372ce3587d6c92 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_detection_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d334e9153056cde09ca4870a7b611704a137696 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78a06db6591e2f6169a947f25a6f179e6f5111dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/torch_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f6a266fd2d31ca9c6fa04b1113af3b3530d98dd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/__pycache__/train_loop_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/config.py b/.venv/lib/python3.11/site-packages/ray/train/torch/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a0ecc61e3b874e1624b4e5ccdd3ae0e77a176d68 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/config.py @@ -0,0 +1,213 @@ +import logging +import os +from dataclasses import dataclass +from datetime import timedelta +from 
typing import Optional + +import torch +import torch.distributed as dist +from packaging.version import Version + +import ray +from ray.air._internal.device_manager import register_custom_torch_dist_backend +from ray.train._internal.utils import get_address_and_port +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +class TorchConfigContextManager: + def __enter__(self): + # Set default cuda device + if torch.cuda.is_available(): + device = ray.train.torch.get_device() + if device.type == "cuda": + torch.cuda.set_device(device) + + def __exit__(self, type, value, traceback): + # Propagate exceptions if any + return False + + +@PublicAPI(stability="stable") +@dataclass +class TorchConfig(BackendConfig): + """Configuration for torch process group setup. + + See https://pytorch.org/docs/stable/distributed.html for more info. + + Args: + backend: The backend to use for training. + See ``torch.distributed.init_process_group`` for more info and + valid values. + If set to None, nccl will be used if GPUs are requested, else gloo + will be used. + init_method: The initialization method to use. Either "env" + for environment variable initialization or "tcp" for TCP + initialization. Defaults to "env". + timeout_s: Seconds for process group operations to timeout. + """ + + backend: Optional[str] = None + init_method: str = "env" + timeout_s: int = 1800 + + @property + def backend_cls(self): + return _TorchBackend + + @property + def train_func_context(self): + return TorchConfigContextManager + + +def _setup_torch_process_group( + backend: str, + world_rank: int, + world_size: int, + init_method: str, + timeout_s: int = 1800, +): + """Connects the distributed PyTorch backend. + + Args: + backend: The backend (nccl, gloo, etc.) to use for training. + world_rank: Rank of the current worker. + world_size: Number of workers participating in the job. + init_method: URL specifying how to initialize the process group. + timeout_s: Seconds for process group operations to timeout. + """ + if world_rank == 0: + logger.info( + f"Setting up process group for: {init_method} [rank={world_rank}, " + f"world_size={world_size}]" + ) + else: + logger.debug( + f"Setting up process group for: {init_method} [rank={world_rank}, " + f"world_size={world_size}]" + ) + logger.debug(f"using {backend}") + + if backend == "nccl": + # See https://github.com/pytorch/pytorch/blob/c263bd43e8e8502d4726643bc6fd046f0130ac0e/torch/distributed/distributed_c10d.py#L803-L823 # noqa: E501 + # We do not use TORCH_NCCL_BLOCKING_WAIT due to performance overhead. + if Version(torch.__version__) < Version("2.2.0"): + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR = "NCCL_ASYNC_ERROR_HANDLING" + TORCH_NCCL_BLOCKING_WAIT_ENV_VAR = "NCCL_BLOCKING_WAIT" + else: + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR = "TORCH_NCCL_ASYNC_ERROR_HANDLING" + TORCH_NCCL_BLOCKING_WAIT_ENV_VAR = "TORCH_NCCL_BLOCKING_WAIT" + if ( + TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR not in os.environ + and TORCH_NCCL_BLOCKING_WAIT_ENV_VAR not in os.environ + ): + logger.debug( + f"Setting {TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR}=1 to fail if NCCL collective communication operations are timing out. " # noqa: E501 + f"To override this behavior, you can set {TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR}=0." 
# noqa: E501 + ) + os.environ[TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR] = "1" + elif backend == "hccl": + register_custom_torch_dist_backend(backend) + + dist.init_process_group( + backend=backend, + init_method=init_method, + rank=world_rank, + world_size=world_size, + timeout=timedelta(seconds=timeout_s), + ) + + +def _shutdown_torch(destroy_process_group=False): + from ray.air._internal.torch_utils import get_devices + + devices = get_devices() + if destroy_process_group: + dist.destroy_process_group() + if torch.cuda.is_available(): + for device in devices: + with torch.cuda.device(device): + torch.cuda.empty_cache() + + +def _set_torch_distributed_env_vars(): + # Same env vars as in + # https://pytorch.org/docs/stable/elastic/run.html#environment-variables + from ray.train.torch import get_device + + context = ray.train.get_context() + os.environ["LOCAL_RANK"] = str(context.get_local_rank()) + os.environ["RANK"] = str(context.get_world_rank()) + os.environ["LOCAL_WORLD_SIZE"] = str(context.get_local_world_size()) + os.environ["WORLD_SIZE"] = str(context.get_world_size()) + os.environ["NODE_RANK"] = str(context.get_node_rank()) + + # Makes sure Hugging Face Accelerate uses the correct device + device = get_device() + os.environ["ACCELERATE_TORCH_DEVICE"] = str(device) + + +class _TorchBackend(Backend): + share_cuda_visible_devices: bool = True + + def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig): + if dist.is_available(): + # Set the appropriate training backend. + if backend_config.backend is None: + if worker_group.num_gpus_per_worker > 0: + backend = "nccl" + else: + backend = "gloo" + else: + backend = backend_config.backend + + master_addr, master_port = worker_group.execute_single( + 0, get_address_and_port + ) + if backend_config.init_method == "env": + + def set_env_vars(addr, port): + os.environ["MASTER_ADDR"] = addr + os.environ["MASTER_PORT"] = str(port) + + worker_group.execute(set_env_vars, addr=master_addr, port=master_port) + url = "env://" + elif backend_config.init_method == "tcp": + url = f"tcp://{master_addr}:{master_port}" + else: + raise ValueError( + f"The provided init_method (" + f"{backend_config.init_method}) is not supported. Must " + f"be either 'env' or 'tcp'." 
+ ) + + setup_futures = [] + for i in range(len(worker_group)): + setup_futures.append( + worker_group.execute_single_async( + i, + _setup_torch_process_group, + backend=backend, + world_rank=i, + world_size=len(worker_group), + init_method=url, + timeout_s=backend_config.timeout_s, + ) + ) + ray.get(setup_futures) + else: + raise RuntimeError("Distributed torch is not available.") + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchConfig): + worker_group.execute( + _shutdown_torch, + destroy_process_group=len(worker_group) > 1, + ) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: BackendConfig + ): + worker_group.execute(_set_torch_distributed_env_vars) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..7b6eeae305185bdd638042a7c1a0386f8d26bd86 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_checkpoint.py @@ -0,0 +1,182 @@ +import os +import tempfile +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional + +import torch + +from ray.air._internal.torch_utils import ( + consume_prefix_in_state_dict_if_present_not_in_place, + load_torch_model, +) +from ray.train._internal.framework_checkpoint import FrameworkCheckpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +ENCODED_DATA_KEY = "torch_encoded_data" + + +@PublicAPI(stability="beta") +class TorchCheckpoint(FrameworkCheckpoint): + """A :class:`~ray.train.Checkpoint` with Torch-specific functionality.""" + + MODEL_FILENAME = "model.pt" + + @classmethod + def from_state_dict( + cls, + state_dict: Dict[str, Any], + *, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TorchCheckpoint": + """Create a :class:`~ray.train.Checkpoint` that stores a model state dictionary. + + .. tip:: + + This is the recommended method for creating + :class:`TorchCheckpoints`. + + Args: + state_dict: The model state dictionary to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :class:`TorchCheckpoint` containing the specified state dictionary. + + Examples: + + .. testcode:: + + import torch + import torch.nn as nn + from ray.train.torch import TorchCheckpoint + + # Set manual seed + torch.manual_seed(42) + + # Function to create a NN model + def create_model() -> nn.Module: + model = nn.Sequential(nn.Linear(1, 10), + nn.ReLU(), + nn.Linear(10,1)) + return model + + # Create a TorchCheckpoint from our model's state_dict + model = create_model() + checkpoint = TorchCheckpoint.from_state_dict(model.state_dict()) + + # Now load the model from the TorchCheckpoint by providing the + # model architecture + model_from_chkpt = checkpoint.get_model(create_model()) + + # Assert they have the same state dict + assert str(model.state_dict()) == str(model_from_chkpt.state_dict()) + print("worked") + + .. testoutput:: + :hide: + + ... + """ + tempdir = tempfile.mkdtemp() + + model_path = Path(tempdir, cls.MODEL_FILENAME).as_posix() + stripped_state_dict = consume_prefix_in_state_dict_if_present_not_in_place( + state_dict, "module." 
+ ) + torch.save(stripped_state_dict, model_path) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + return checkpoint + + @classmethod + def from_model( + cls, + model: torch.nn.Module, + *, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TorchCheckpoint": + """Create a :class:`~ray.train.Checkpoint` that stores a Torch model. + + .. note:: + + PyTorch recommends storing state dictionaries. To create a + :class:`TorchCheckpoint` from a state dictionary, call + :meth:`~ray.train.torch.TorchCheckpoint.from_state_dict`. To learn more + about state dictionaries, read + `Saving and Loading Models `_. # noqa: E501 + + Args: + model: The Torch model to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :class:`TorchCheckpoint` containing the specified model. + + Examples: + + .. testcode:: + + from ray.train.torch import TorchCheckpoint + import torch + + # Create model identity and send a random tensor to it + model = torch.nn.Identity() + input = torch.randn(2, 2) + output = model(input) + + # Create a checkpoint + checkpoint = TorchCheckpoint.from_model(model) + print(checkpoint) + + .. testoutput:: + :hide: + + ... + """ + tempdir = tempfile.mkdtemp() + + model_path = Path(tempdir, cls.MODEL_FILENAME).as_posix() + torch.save(model, model_path) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + return checkpoint + + def get_model(self, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: + """Retrieve the model stored in this checkpoint. + + Args: + model: If the checkpoint contains a model state dict, and not + the model itself, then the state dict will be loaded to this + ``model``. Otherwise, the model will be discarded. + """ + with self.as_directory() as tempdir: + model_path = Path(tempdir, self.MODEL_FILENAME).as_posix() + if not os.path.exists(model_path): + raise RuntimeError( + "`model.pt` not found within this checkpoint. Make sure you " + "created this `TorchCheckpoint` from one of its public " + "constructors (`from_state_dict` or `from_model`)." + ) + model_or_state_dict = torch.load(model_path, map_location="cpu") + + if isinstance(model_or_state_dict, torch.nn.Module): + if model: + warnings.warn( + "TorchCheckpoint already contains all information needed. " + "Discarding provided `model` argument. If you are using " + "TorchPredictor directly, you should do " + "`TorchPredictor.from_checkpoint(checkpoint)` by removing kwargs " + "`model=`." + ) + model = load_torch_model( + saved_model=model_or_state_dict, model_definition=model + ) + return model diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..98a708a731ca2976d57c7866dfc0707d5c255545 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_detection_predictor.py @@ -0,0 +1,90 @@ +import collections +from typing import Dict, List, Optional, Union + +import numpy as np +import torch + +from ray.train._internal.dl_predictor import TensorDtype +from ray.train.torch.torch_predictor import TorchPredictor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class TorchDetectionPredictor(TorchPredictor): + """A predictor for TorchVision detection models. 
+ + Unlike other Torch models, instance segmentation models return + `List[Dict[str, Tensor]]`. This predictor extends :class:`TorchPredictor` to support + the non-standard outputs. + + To learn more about instance segmentation models, read + `Instance segmentation models `_. + + Example: + + .. testcode:: + + import numpy as np + from torchvision import models + + from ray.train.torch import TorchDetectionPredictor + + model = models.detection.fasterrcnn_resnet50_fpn_v2(pretrained=True) + + predictor = TorchDetectionPredictor(model=model) + predictions = predictor.predict(np.zeros((4, 3, 32, 32), dtype=np.float32)) + + print(predictions.keys()) + + .. testoutput:: + + dict_keys(['pred_boxes', 'pred_labels', 'pred_scores']) + + """ # noqa: E501 + + def _predict_numpy( + self, + data: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]], + ) -> Dict[str, np.ndarray]: + if isinstance(data, dict) and len(data) != 1: + raise ValueError( + f"""Expected input to contain one key, but got {len(data)} instead.""" + ) + + if dtype is not None and not isinstance(dtype, torch.dtype): + raise ValueError( + "Expected `dtype` to be a `torch.dtype`, but got a " + f"{type(dtype).__name__} instead." + ) + + if isinstance(data, dict): + images = next(iter(data.values())) + else: + images = data + + inputs = [ + torch.as_tensor(image, dtype=dtype).to(self.device) for image in images + ] + outputs = self.call_model(inputs) + outputs = _convert_outputs_to_batch(outputs) + outputs = {"pred_" + key: value for key, value in outputs.items()} + + return outputs + + +def _convert_outputs_to_batch( + outputs: List[Dict[str, torch.Tensor]], +) -> Dict[str, List[torch.Tensor]]: + """Batch detection model outputs. + + TorchVision detection models return `List[Dict[Tensor]]`. Each `Dict` contain + 'boxes', 'labels, and 'scores'. + + This function batches values and returns a `Dict[str, List[Tensor]]`. + """ # noqa: E501 + batch = collections.defaultdict(list) + for output in outputs: + for key, value in output.items(): + batch[key].append(value.cpu().detach()) + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..be81d5b0f68c0a397239940bc9b310fc79a0b864 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_predictor.py @@ -0,0 +1,250 @@ +import logging +from typing import TYPE_CHECKING, Dict, Optional, Union + +import numpy as np +import torch + +from ray.air._internal.torch_utils import convert_ndarray_batch_to_torch_tensor_batch +from ray.train._internal.dl_predictor import DLPredictor +from ray.train.predictor import DataBatchType +from ray.train.torch import TorchCheckpoint +from ray.util import log_once +from ray.util.annotations import DeveloperAPI, PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="beta") +class TorchPredictor(DLPredictor): + """A predictor for PyTorch models. + + Args: + model: The torch module to use for predictions. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. 
+ """ + + def __init__( + self, + model: torch.nn.Module, + preprocessor: Optional["Preprocessor"] = None, + use_gpu: bool = False, + ): + self.model = model + self.model.eval() + self.use_gpu = use_gpu + + if use_gpu: + # TODO (jiaodong): #26249 Use multiple GPU devices with sharded input + self.device = torch.device("cuda") + else: + self.device = torch.device("cpu") + + # Ensure input tensor and model live on the same device + self.model.to(self.device) + + if ( + not use_gpu + and torch.cuda.device_count() > 0 + and log_once("torch_predictor_not_using_gpu") + ): + logger.warning( + "You have `use_gpu` as False but there are " + f"{torch.cuda.device_count()} GPUs detected on host where " + "prediction will only use CPU. Please consider explicitly " + "setting `TorchPredictor(use_gpu=True)` or " + "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " + "enable GPU prediction." + ) + + super().__init__(preprocessor) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r}, use_gpu={self.use_gpu!r})" + ) + + @classmethod + def from_checkpoint( + cls, + checkpoint: TorchCheckpoint, + model: Optional[torch.nn.Module] = None, + use_gpu: bool = False, + ) -> "TorchPredictor": + """Instantiate the predictor from a TorchCheckpoint. + + Args: + checkpoint: The checkpoint to load the model and preprocessor from. + model: If the checkpoint contains a model state dict, and not + the model itself, then the state dict will be loaded to this + ``model``. If the checkpoint already contains the model itself, + this model argument will be discarded. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. + """ + model = checkpoint.get_model(model) + preprocessor = checkpoint.get_preprocessor() + return cls(model=model, preprocessor=preprocessor, use_gpu=use_gpu) + + @DeveloperAPI + def call_model( + self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]] + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + """Runs inference on a single batch of tensor data. + + This method is called by `TorchPredictor.predict` after converting the + original data batch to torch tensors. + + Override this method to add custom logic for processing the model input or + output. + + Args: + inputs: A batch of data to predict on, represented as either a single + PyTorch tensor or for multi-input models, a dictionary of tensors. + + Returns: + The model outputs, either as a single tensor or a dictionary of tensors. + + Example: + + .. testcode:: + + import numpy as np + import torch + from ray.train.torch import TorchPredictor + + # List outputs are not supported by default TorchPredictor. + # So let's define a custom TorchPredictor and override call_model + class MyModel(torch.nn.Module): + def forward(self, input_tensor): + return [input_tensor, input_tensor] + + # Use a custom predictor to format model output as a dict. + class CustomPredictor(TorchPredictor): + def call_model(self, inputs): + model_output = super().call_model(inputs) + return { + str(i): model_output[i] for i in range(len(model_output)) + } + + # create our data batch + data_batch = np.array([1, 2]) + # create custom predictor and predict + predictor = CustomPredictor(model=MyModel()) + predictions = predictor.predict(data_batch) + print(f"Predictions: {predictions.get('0')}, {predictions.get('1')}") + + .. 
testoutput:: + + Predictions: [1 2], [1 2] + + """ + with torch.no_grad(): + output = self.model(inputs) + return output + + def predict( + self, + data: DataBatchType, + dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + ) -> DataBatchType: + """Run inference on a data batch. + + If the provided data is a single array or a dataframe/table with a single + column, it will be converted into a single PyTorch tensor before being + inputted to the model. + + If the provided data is a multi-column table or a dict of numpy arrays, + it will be converted into a dict of tensors before being inputted to the + model. This is useful for multi-modal inputs (for example your model accepts + both image and text). + + Args: + data: A batch of input data of ``DataBatchType``. + dtype: The dtypes to use for the tensors. Either a single dtype for all + tensors or a mapping from column name to dtype. + + Returns: + DataBatchType: Prediction result. The return type will be the same as the + input type. + + Example: + + .. testcode:: + + import numpy as np + import pandas as pd + import torch + import ray + from ray.train.torch import TorchPredictor + + # Define a custom PyTorch module + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(1, 1) + self.linear2 = torch.nn.Linear(1, 1) + + def forward(self, input_dict: dict): + out1 = self.linear1(input_dict["A"].unsqueeze(1)) + out2 = self.linear2(input_dict["B"].unsqueeze(1)) + return out1 + out2 + + # Set manual seed so we get consistent output + torch.manual_seed(42) + + # Use Standard PyTorch model + model = torch.nn.Linear(2, 1) + predictor = TorchPredictor(model=model) + # Define our data + data = np.array([[1, 2], [3, 4]]) + predictions = predictor.predict(data, dtype=torch.float) + print(f"Standard model predictions: {predictions}") + print("---") + + # Use Custom PyTorch model with TorchPredictor + predictor = TorchPredictor(model=CustomModule()) + # Define our data and predict with the custom model using TorchPredictor + data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + predictions = predictor.predict(data, dtype=torch.float) + print(f"Custom model predictions: {predictions}") + + .. testoutput:: + + Standard model predictions: {'predictions': array([[1.5487633], + [3.8037925]], dtype=float32)} + --- + Custom model predictions: predictions + 0 [0.61623406] + 1 [2.857038] + """ + return super(TorchPredictor, self).predict(data=data, dtype=dtype) + + def _arrays_to_tensors( + self, + numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]], + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + return convert_ndarray_batch_to_torch_tensor_batch( + numpy_arrays, + dtypes=dtype, + device=self.device, + ) + + def _tensor_to_array(self, tensor: torch.Tensor) -> np.ndarray: + if not isinstance(tensor, torch.Tensor): + raise ValueError( + "Expected the model to return either a torch.Tensor or a " + f"dict of torch.Tensor, but got {type(tensor)} instead. " + f"To support models with different output types, subclass " + f"TorchPredictor and override the `call_model` method to " + f"process the output into either torch.Tensor or Dict[" + f"str, torch.Tensor]."
+ ) + return tensor.cpu().detach().numpy() diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8df6a6cdbe80016c528bd5a1ac963af86eb9ff64 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/torch_trainer.py @@ -0,0 +1,204 @@ +from typing import Any, Callable, Dict, Optional, Union + +from ray.train import Checkpoint, DataConfig, RunConfig, ScalingConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.torch.config import TorchConfig +from ray.train.trainer import GenDataset +from ray.util import PublicAPI + + +@PublicAPI(stability="stable") +class TorchTrainer(DataParallelTrainer): + """A Trainer for data parallel PyTorch training. + + At a high level, this Trainer does the following: + + 1. Launches multiple workers as defined by the ``scaling_config``. + 2. Sets up a distributed PyTorch environment + on these workers as defined by the ``torch_config``. + 3. Ingests the input ``datasets`` based on the ``dataset_config``. + 4. Runs the input ``train_loop_per_worker(train_loop_config)`` + on all workers. + + For more details, see: + + * :ref:`PyTorch Guide ` + * :ref:`PyTorch Lightning Guide ` + * :ref:`Hugging Face Transformers Guide ` + + Example: + + .. testcode:: + + import os + import tempfile + + import torch + from torch import nn + from torch.nn.parallel import DistributedDataParallel + + import ray + from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig + from ray.train.torch import TorchTrainer + + # If using GPUs, set this to True. + use_gpu = False + # Number of processes to run training on. + num_workers = 4 + + # Define your network structure. + class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.layer1 = nn.Linear(1, 32) + self.relu = nn.ReLU() + self.layer2 = nn.Linear(32, 1) + + def forward(self, input): + return self.layer2(self.relu(self.layer1(input))) + + # Training loop. + def train_loop_per_worker(config): + + # Read configurations. + lr = config["lr"] + batch_size = config["batch_size"] + num_epochs = config["num_epochs"] + + # Fetch training dataset. + train_dataset_shard = ray.train.get_dataset_shard("train") + + # Instantiate and prepare model for training. + model = NeuralNetwork() + model = ray.train.torch.prepare_model(model) + + # Define loss and optimizer. + loss_fn = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + # Create data loader. + dataloader = train_dataset_shard.iter_torch_batches( + batch_size=batch_size, dtypes=torch.float + ) + + # Train multiple epochs. + for epoch in range(num_epochs): + + # Train epoch. + for batch in dataloader: + output = model(batch["input"]) + loss = loss_fn(output, batch["label"]) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Create checkpoint. + base_model = (model.module + if isinstance(model, DistributedDataParallel) else model) + checkpoint_dir = tempfile.mkdtemp() + torch.save( + {"model_state_dict": base_model.state_dict()}, + os.path.join(checkpoint_dir, "model.pt"), + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + + # Report metrics and checkpoint. + ray.train.report({"loss": loss.item()}, checkpoint=checkpoint) + + + # Define configurations. 
+ train_loop_config = {"num_epochs": 20, "lr": 0.01, "batch_size": 32} + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1)) + + # Define datasets. + train_dataset = ray.data.from_items( + [{"input": [x], "label": [2 * x + 1]} for x in range(2000)] + ) + datasets = {"train": train_dataset} + + # Initialize the Trainer. + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + scaling_config=scaling_config, + run_config=run_config, + datasets=datasets + ) + + # Train the model. + result = trainer.fit() + + # Inspect the results. + final_loss = result.metrics["loss"] + + .. testoutput:: + :hide: + + ... + + Args: + + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. Passing large + datasets via `train_loop_config` is not recommended and may introduce + large overhead and unknown issues with serialization and deserialization. + torch_config: The configuration for setting up the PyTorch Distributed backend. + If set to None, a default configuration will be used in which + GPU training uses NCCL and CPU training uses Gloo. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + torch_config: Optional[TorchConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[DataConfig] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + if not torch_config: + torch_config = TorchConfig() + + super(TorchTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=torch_config, + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py b/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..465eed45a4a894f301ae6d585bfdbeac3839d6e4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/train_loop_utils.py @@ -0,0 +1,774 @@ +import collections +import logging +import os +import random +import types +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from packaging.version import Version +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel +from torch.optim import Optimizer +from torch.utils.data import ( + DataLoader, + DistributedSampler, + IterableDataset, + RandomSampler, + SequentialSampler, +) + +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.air._internal.device_manager import ( + get_torch_device_manager_by_context, + get_torch_device_manager_by_device_type, +) +from ray.train._internal import session +from ray.train._internal.accelerator import Accelerator +from ray.train._internal.session import get_accelerator, set_accelerator +from ray.util.annotations import Deprecated, PublicAPI + +if Version(torch.__version__) < Version("1.11.0"): + FullyShardedDataParallel = None +else: + from torch.distributed.fsdp import FullyShardedDataParallel + +try: + from torch.profiler import profile +except ImportError: + profile = None + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="stable") +def get_device() -> torch.device: + """Gets the correct torch device configured for this process. + + Returns the torch device for the current worker. If more than 1 GPU is + requested per worker, returns the device with the minimal device index. + + .. note:: + + If you requested multiple GPUs per worker, and want to get + the full list of torch devices, please use + :meth:`~ray.train.torch.get_devices`. + + Assumes that `CUDA_VISIBLE_DEVICES` is set and is a + superset of the `ray.get_gpu_ids()`. + + Examples: + + Example: Launched 2 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:0") + + Example: Launched 4 workers on the current node, each with 1 GPU + + .. 
testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:2") + + Example: Launched 2 workers on the current node, each with 2 GPUs + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + ray.get_gpu_ids() == [2,3] + torch.cuda.is_available() == True + get_device() == torch.device("cuda:2") + + + You can move a model to device by: + + .. testcode:: + :skipif: True + + model.to(ray.train.torch.get_device()) + + Instead of manually checking the device type: + + .. testcode:: + :skipif: True + + model.to("cuda" if torch.cuda.is_available() else "cpu") + """ + from ray.air._internal import torch_utils + + record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICE, "1") + return torch_utils.get_devices()[0] + + +@PublicAPI(stability="beta") +def get_devices() -> List[torch.device]: + """Gets the correct torch device list configured for this process. + + Assumes that `CUDA_VISIBLE_DEVICES` is set and is a + superset of the `ray.get_gpu_ids()`. + + + Examples: + + Example: Launched 2 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:0")] + + Example: Launched 4 workers on the current node, each with 1 GPU + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "0,1,2,3" + ray.get_gpu_ids() == [2] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:2")] + + Example: Launched 2 workers on the current node, each with 2 GPUs + + .. testcode:: + :skipif: True + + os.environ["CUDA_VISIBLE_DEVICES"] == "0,1,2,3" + ray.get_gpu_ids() == [2,3] + torch.cuda.is_available() == True + get_devices() == [torch.device("cuda:2"), torch.device("cuda:3")] + """ + + from ray.air._internal import torch_utils + + record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICES, "1") + return torch_utils.get_devices() + + +@PublicAPI(stability="stable") +def prepare_model( + model: torch.nn.Module, + move_to_device: Union[bool, torch.device] = True, + parallel_strategy: Optional[str] = "ddp", + parallel_strategy_kwargs: Optional[Dict[str, Any]] = None, +) -> torch.nn.Module: + """Prepares the model for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + model (torch.nn.Module): A torch model to prepare. + move_to_device: Either a boolean indiciating whether to move + the model to the correct device or an actual device to + move the model to. If set to False, the model needs + to manually be moved to the correct device. + parallel_strategy ("ddp", "fsdp", or None): Whether to wrap models + in ``DistributedDataParallel``, ``FullyShardedDataParallel``, + or neither. + parallel_strategy_kwargs (Dict[str, Any]): Args to pass into + ``DistributedDataParallel`` or ``FullyShardedDataParallel`` + initialization if ``parallel_strategy`` is set to "ddp" + or "fsdp", respectively. + """ + + if parallel_strategy == "fsdp" and FullyShardedDataParallel is None: + raise ImportError( + "FullyShardedDataParallel requires torch>=1.11.0. " + "Run `pip install 'torch>=1.11.0'` to use FullyShardedDataParallel." 
+ ) + + record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_MODEL, "1") + return get_accelerator(_TorchAccelerator).prepare_model( + model, + move_to_device=move_to_device, + parallel_strategy=parallel_strategy, + parallel_strategy_kwargs=parallel_strategy_kwargs, + ) + + +@PublicAPI(stability="stable") +def prepare_data_loader( + data_loader: torch.utils.data.DataLoader, + add_dist_sampler: bool = True, + move_to_device: bool = True, + auto_transfer: bool = True, +) -> torch.utils.data.DataLoader: + """Prepares :class:`~torch.utils.data.DataLoader` for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + .. note:: + + This method adds a `DistributedSampler` to the `DataLoader` if the + number of training workers is greater than 1. If shuffling is + enabled on the original `DataLoader`, then `shuffle=True` will also + be passed into the `DistributedSampler` constructor. `shuffle=False` + on the original `DataLoader` also means that shuffling is disabled + on the sampler. + + With more than 1 worker, calling the `DistributedSampler.set_epoch` method + at the beginning of each epoch before creating the DataLoader iterator + is necessary to make shuffling work properly across multiple epochs. + Otherwise, the same ordering will always be used. + See: https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler # noqa: E501 + + Example: + + .. testcode:: + :skipif: True + + import torch + + import ray.train.torch + + train_dataloader = torch.utils.data.DataLoader( + ..., batch_size=..., shuffle=True + ) + train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader) + + for epoch in range(10): + if ray.train.get_context().get_world_size() > 1: + # Required for the distributed sampler to shuffle properly across epochs + train_dataloader.sampler.set_epoch(epoch) + + for X, y in train_dataloader: + # No need to move data to GPU, this is done by `prepare_data_loader`! + # X, y = X.to("cuda"), y.to("cuda") + ... + + Args: + data_loader (torch.utils.data.DataLoader): The DataLoader to + prepare. + add_dist_sampler: Whether to add a DistributedSampler to + the provided DataLoader. + move_to_device: If set, automatically move the data + returned by the data loader to the correct device. + auto_transfer: If set and device is GPU, another CUDA stream + is created to automatically copy data from host (CPU) memory + to device (GPU) memory (the default CUDA stream still runs the + training procedure). If device is CPU, it will be disabled + regardless of the setting. This configuration will be ignored + if ``move_to_device`` is False. + """ + record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_DATALOADER, "1") + return get_accelerator(_TorchAccelerator).prepare_data_loader( + data_loader, + add_dist_sampler=add_dist_sampler, + move_to_device=move_to_device, + auto_transfer=auto_transfer, + ) + + +@PublicAPI(stability="beta") +def accelerate(amp: bool = False) -> None: + """Enables training optimizations. + + Arguments: + amp: If true, perform training with automatic mixed precision. + Otherwise, use full precision. + + .. warning:: ``train.torch.accelerate`` cannot be called more than once, and it + must be called before any other ``train.torch`` utility function. + """ + try: + set_accelerator(_TorchAccelerator(amp=amp)) + except RuntimeError: + raise RuntimeError( + "An accelerator has already been set.
Make sure " + "`train.torch.accelerate()` is not called multiple times, and is called " + "before any of the prepare methods." + ) + + +@PublicAPI(stability="beta") +def prepare_optimizer(optimizer: torch.optim.Optimizer) -> torch.optim.Optimizer: + """Wraps optimizer to support automatic mixed precision. + + Args: + optimizer (torch.optim.Optimizer): The optimizer to prepare. + + Returns: + A wrapped optimizer. + """ + return get_accelerator(_TorchAccelerator).prepare_optimizer(optimizer) + + +@PublicAPI(stability="beta") +def backward(tensor: torch.Tensor) -> None: + """Computes the gradient of the specified tensor w.r.t. graph leaves. + + Args: + tensor (torch.Tensor): Tensor of which the derivative will be computed. + """ + get_accelerator(_TorchAccelerator).backward(tensor) + + +@PublicAPI(stability="stable") +def enable_reproducibility(seed: int = 0) -> None: + """Limits sources of nondeterministic behavior. + + This function: + + * Seeds PyTorch, Python, and NumPy. + * Disables CUDA convolution benchmarking. + * Configures PyTorch to use deterministic algorithms. + * Seeds workers spawned for multi-process data loading. + + Args: + seed: The number to seed libraries and data workers with. + + .. warning:: ``train.torch.enable_reproducibility()`` can't guarantee + completely reproducible results across executions. To learn more, read + the `PyTorch notes on randomness + `_. + """ + get_accelerator(_TorchAccelerator).enable_reproducibility(seed) + + +@Deprecated +class TorchWorkerProfiler: + """Utility class for running PyTorch Profiler on a Train worker. + + Args: + trace_dir (Optional[str]): The directory to store traces on the + worker node. If ``None``, this will use a default temporary dir. + """ + + WORKER_TRACE_DIR_NAME = "pytorch_profiler_worker_traces" + + def __init__(self, trace_dir: Optional[str] = None): + raise DeprecationWarning( + "The `ray.train.torch.TorchWorkerProfiler` API is deprecated in Ray 2.0.", + ) + + +class _TorchAccelerator(Accelerator): + """A utility that implements methods to accelerate PyTorch training. + + Arguments: + amp: If true, perform training with automatic mixed precision. + Otherwise, use full precision. + """ + + def __init__(self, amp: bool = False): + self.amp_is_enabled = amp + self.scaler = GradScaler() if amp else None + self._seed = None + self.device_manager = get_torch_device_manager_by_context() + + def prepare_model( + self, + model: torch.nn.Module, + move_to_device: bool = True, + parallel_strategy: Optional[str] = "ddp", + parallel_strategy_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.nn.Module: + """Prepares the model for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + model (torch.nn.Module): A torch model to prepare. + move_to_device: Whether to move the model to the correct + device. If set to False, the model needs to manually be moved + to the correct device. + parallel_strategy ("ddp", "fsdp", or None): Whether to wrap models + in ``DistributedDataParallel``, ``FullyShardedDataParallel`` ( + Experimental), or neither. + parallel_strategy_kwargs (Dict[str, Any]): Args to pass into + ``DistributedDataParallel`` or ``FullyShardedDataParallel`` + initialization if ``parallel_strategy`` is set to "ddp" + or "fsdp", respectively.
+ """ + parallel_strategy_kwargs = parallel_strategy_kwargs or {} + + rank = session.get_local_rank() + + if isinstance(move_to_device, torch.device): + device = move_to_device + else: + device = get_device() + if isinstance(device, list): + device = device[0] + + if self.device_manager.is_available(): + self.device_manager.set_device(device) + + if move_to_device: + if rank == 0: + logger.info(f"Moving model to device: {device}") + else: + logger.debug(f"Moving model to device: {device}") + model = model.to(device) + + def model_get_state(self): + # `__getstate__` is an special method that informs pickle which attributes + # to serialize. This custom implementation ensures that the wrapped forward + # method and custom `__getstate__` method aren't serialized. + if hasattr(self, "_original_get_state"): + state = self._original_get_state() + state["__getstate__"] = state["_original_get_state"] + del state["_original_get_state"] + else: + # If model does not have a `__getstate__` already defined, use default + # implementation. + state = self.__dict__.copy() + del state["__getstate__"] + state["forward"] = state["_unwrapped_forward"] + del state["_unwrapped_forward"] + + return state + + if self.amp_is_enabled: + # Pickle cannot serialize the wrapped forward method. As a workaround, + # define a custom `__getstate__` method that unwraps the forward method. + model._unwrapped_forward = model.forward + model.forward = autocast()(model.forward) + + # TODO(amogkam): Replace below logic with a generic "unpack model" method. + # Replacing the `model.forward` method makes the model no longer + # serializable. When serializing the model, we have to override the + # `__getstate__` method to set back the original forward method. + if hasattr(model, "__getstate__"): + model._original_get_state = model.__getstate__ + # `__getstate__` must be a bound method rather than an callable attribute. + # See https://stackoverflow.com/questions/972/adding-a-method-to-an-existing-object-instance. # noqa: E501 + model.__getstate__ = types.MethodType(model_get_state, model) + + world_size = session.get_world_size() + + if parallel_strategy and world_size > 1: + if parallel_strategy == "ddp": + DataParallel = DistributedDataParallel + if self.device_manager.is_available() and device.type != "cpu": + parallel_strategy_kwargs = { + "device_ids": [device], + "output_device": device, + **parallel_strategy_kwargs, + } + else: + if not torch.cuda.is_available(): + raise RuntimeError( + "FSDP is only available with GPU-enabled " + "training. Set " + "`use_gpu=True` in your Trainer to train with " + "GPUs." + ) + DataParallel = FullyShardedDataParallel + if rank == 0: + logger.info(f"Wrapping provided model in {DataParallel.__name__}.") + else: + logger.debug(f"Wrapping provided model in {DataParallel.__name__}.") + model = DataParallel(model, **parallel_strategy_kwargs) + + return model + + def prepare_data_loader( + self, + data_loader: torch.utils.data.DataLoader, + add_dist_sampler: bool = True, + move_to_device: bool = True, + auto_transfer: bool = False, + ) -> torch.utils.data.DataLoader: + """Prepares DataLoader for distributed execution. + + This allows you to use the same exact code regardless of number of + workers or the device type being used (CPU, GPU). + + Args: + data_loader (torch.utils.data.DataLoader): The DataLoader to + prepare. + add_dist_sampler: Whether to add a DistributedSampler to + the provided DataLoader. 
+ move_to_device: If set, automatically move the data + returned by the data loader to the correct device. + auto_transfer: (Experimental) If set and device is GPU, another CUDA stream + is created to automatically copy data from host (CPU) memory + to device (GPU) memory (the default CUDA stream still runs the + training procedure). If device is CPU, it will be disabled + regardless of the setting. This configuration will be ignored + if ``move_to_device`` is False. + """ + + world_size = session.get_world_size() + world_rank = session.get_world_rank() + + # Only add Distributed Sampler if the following conditions hold: + # 1. More than one training worker is being used. + # 2. A DistributedSampler has not already been added by the user. + # 3. The dataset is not an IterableDataset. Samplers do not work with + # IterableDatasets. + if ( + world_size > 1 + and not isinstance(data_loader.sampler, DistributedSampler) + and not ( + hasattr(data_loader, "dataset") + and isinstance(data_loader.dataset, IterableDataset) + ) + and add_dist_sampler + ): + + def with_sampler(loader): + # Automatically set the DistributedSampler + + # If you're using a sampler, the DataLoader shuffle flag must be set to + # False. Shuffling is instead determined by the shuffle argument passed + # to the DistributedSampler constructor. + + # If no sampler is passed to the DataLoader constructor, Torch + # constructs a default sampler. The default sampler is a RandomSampler + # if shuffling is enabled and a SequentialSampler otherwise. DataLoader + # does not have a shuffle attribute, so we instead identify whether + # shuffling is enabled by checking the default sampler type. + shuffle = not isinstance(loader.sampler, SequentialSampler) + + def seeded_worker_init_fn( + worker_init_fn: Optional[Callable[[int], None]] + ): + def wrapper(worker_id: int): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + if worker_init_fn: + worker_init_fn(worker_id) + + return wrapper + + worker_init_fn: Optional[Callable[[int], None]] = loader.worker_init_fn + generator: Optional[torch.Generator] = loader.generator + if self._seed is not None: + worker_init_fn = seeded_worker_init_fn(worker_init_fn) + generator = torch.Generator() + generator.manual_seed(self._seed) + + using_default_sampler = isinstance( + loader.sampler, (SequentialSampler, RandomSampler) + ) + if not using_default_sampler and world_rank == 0: + logger.warn( + f"The {loader.sampler.__class__.__name__} will be overwritten " + "with a DistributedSampler. You can disable this by setting " + "`add_dist_sampler` to False in `prepare_data_loader`." + ) + + data_loader_args = { + "dataset": loader.dataset, + "batch_size": loader.batch_size, + "shuffle": False, + "num_workers": loader.num_workers, + "collate_fn": loader.collate_fn, + "pin_memory": loader.pin_memory, + "drop_last": loader.drop_last, + "timeout": loader.timeout, + "worker_init_fn": worker_init_fn, + "generator": generator, + "sampler": DistributedSampler(loader.dataset, shuffle=shuffle), + } + return DataLoader(**data_loader_args) + + data_loader = with_sampler(data_loader) + + if move_to_device: + device = get_device() + data_loader = _WrappedDataLoader(data_loader, device, auto_transfer) + + return data_loader + + def prepare_optimizer(self, optimizer: Optimizer) -> Optimizer: + """Wraps optimizer to support automatic mixed precision. + + Args: + optimizer (torch.optim.Optimizer): The optimizer to prepare. + + Returns: + A wrapped optimizer. 
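# A standalone sketch of what `with_sampler` above effectively does: infer
# `shuffle` from the default sampler type and rebuild the DataLoader around a
# DistributedSampler. Explicit num_replicas/rank are passed so this runs
# without a process group; the code above instead relies on the process group
# that Ray Train initializes on each worker.
import torch
from torch.utils.data import (
    DataLoader,
    DistributedSampler,
    SequentialSampler,
    TensorDataset,
)

dataset = TensorDataset(torch.arange(32, dtype=torch.float32).unsqueeze(1))
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# A default RandomSampler means the user asked for shuffling.
shuffle = not isinstance(loader.sampler, SequentialSampler)
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=shuffle)
dist_loader = DataLoader(dataset, batch_size=loader.batch_size, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)  # re-seeds the per-epoch shuffle
    for (batch,) in dist_loader:
        pass  # this rank only sees its 16-sample shard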
+ """ + return _WrappedOptimizer(optimizer, scaler=self.scaler) + + def backward(self, tensor: torch.Tensor) -> None: + """Computes the gradient of the specified tensor w.r.t. graph leaves. + + Args: + tensor (torch.Tensor): Tensor of which the derivative will be computed. + """ + if self.amp_is_enabled: + self.scaler.scale(tensor).backward() + else: + tensor.backward() + + def enable_reproducibility(self, seed: int = 0) -> None: + """Limits sources of nondeterministic behavior.""" + self._seed = seed + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + torch.use_deterministic_algorithms(True) + torch.backends.cudnn.benchmark = False + + # If you want to use deterministic algorithms with CUDA, then you need to set + # the CUBLAS_WORKSPACE_CONFIG environment variable; otherwise, Torch errors. + # See https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility. + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + + +class _WrappedDataLoader(DataLoader): + def __init__( + self, base_dataloader: DataLoader, device: torch.device, auto_transfer: bool + ): + self.__dict__.update(getattr(base_dataloader, "__dict__", {})) + self._dataloader = base_dataloader + self.dataloader_iter = None + self.device = device + + self.device_manager = get_torch_device_manager_by_device_type(device.type) + + # disable auto transfer (host->device) if cpu is used + if device.type != "cpu" and self.device_manager.supports_stream(): + self._auto_transfer = auto_transfer + else: + self._auto_transfer = False + # create a new device stream to move data from host to device concurrently + self._memcpy_stream = ( + self.device_manager.create_stream(device) + if device.type != "cpu" and self._auto_transfer + else None + ) + self.next_batch = None + + def _move_to_device(self, item): + if item is None: + return None + + def try_move_device(i): + try: + i = i.to(self.device, non_blocking=self._auto_transfer) + except AttributeError: + logger.debug(f"Item {i} cannot be moved to device " f"{self.device}.") + return i + + with self.device_manager.get_stream_context(self._memcpy_stream): + if isinstance(item, collections.abc.Mapping): + item_on_device = {k: self._move_to_device(v) for k, v in item.items()} + elif isinstance(item, tuple): + item_on_device = tuple(self._move_to_device(i) for i in item) + elif isinstance(item, list): + item_on_device = [self._move_to_device(i) for i in item] + elif isinstance(item, torch.Tensor): + item_on_device = try_move_device(item) + else: + logger.debug( + f"Data type {type(item)} doesn't support being moved to device." + ) + item_on_device = item + + return item_on_device + + def _wait_for_batch(self, item): + if self._memcpy_stream is None: + return + # Reference: + # https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html + # The training stream (current) needs to wait until + # the memory copy stream finishes. + curr_stream = self.device_manager.get_current_stream() + curr_stream.wait_stream(self._memcpy_stream) + # When a tensor is used by CUDA streams different from + # its original allocator, we need to call ``record_stream`` + # to inform the allocator of all these streams. Otherwise, + # the tensor might be freed once it is no longer used by + # the creator stream. + for i in item: + # The Pytorch DataLoader has no restrictions on what is outputted for + # each batch. We should only ``record_stream`` if the item has the + # ability to do so. 
+ try: + i.record_stream(curr_stream) + except AttributeError: + pass + + def __len__(self): + return len(self._dataloader) + + def _prefetch_next_batch(self): + next_batch = next(self.dataloader_iter, None) + self.next_batch = self._move_to_device(next_batch) + + def __iter__(self): + self.dataloader_iter = iter(self._dataloader) + self._prefetch_next_batch() + return self + + def __next__(self): + next_batch = self.next_batch + if next_batch is None: + raise StopIteration + self._wait_for_batch(next_batch) + self._prefetch_next_batch() + return next_batch + + +class _WrappedOptimizer(Optimizer): + def __init__(self, optimizer: Optimizer, scaler: Optional[GradScaler] = None): + self.optimizer = optimizer + self.scaler = scaler + + @property + def state(self): + return self.optimizer.state + + @state.setter + def state(self, state): + self.optimizer.state = state + + @property + def param_groups(self): + return self.optimizer.param_groups + + @param_groups.setter + def param_groups(self, param_groups): + self.optimizer.param_groups = param_groups + + @property + def defaults(self): + return self.optimizer.defaults + + @defaults.setter + def defaults(self, defaults): + self.optimizer.defaults = defaults + + def add_param_group(self, param_group): + self.optimizer.add_param_group(param_group) + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) + + def state_dict(self): + return self.optimizer.state_dict() + + def zero_grad(self): + self.optimizer.zero_grad() + + def step(self, closure=None): + if self.scaler is not None: + self.scaler.step(self.optimizer, closure) + self.scaler.update() + else: + self.optimizer.step(closure) diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea32abc8c9d7b5514b3c5680e06ee0ab334eb4eb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__init__.py @@ -0,0 +1,5 @@ +from ray.train.torch.xla.config import TorchXLAConfig + +__all__ = [ + "TorchXLAConfig", +] diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456f68323a5adb487aab3f89cbc4a91158e08eff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b814a756106585ff5ddc2d962789f33338eeb3e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e965f9fc269acbc19dded2001051bf350ddf80f1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/torch/xla/config.py @@ -0,0 +1,169 @@ +import logging +import os +import re +import shutil +import uuid +from dataclasses import dataclass + +import ray +from ray.train._internal.utils import get_address_and_port +from 
ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend +from ray.train.torch import TorchConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +@dataclass +class TorchXLAConfig(TorchConfig): + """ + Configuration for torch XLA setup. + See https://pytorch.org/xla/release/1.13/index.html for more info. + Currently, only "neuron_cores" accelerator (AwsNeuronXLABackend) + is supported with xrt runtime. + """ + + neuron_parallel_compile: bool = False + + @property + def backend_cls(self): + return _TorchAwsNeuronXLABackend + + +def _kill_xrt_server(): + import subprocess + + subprocess.call(["pkill", "-f", "xrt_run_server"]) + + +def _set_xla_env_vars(): + # https://pytorch.org/docs/1.13/elastic/run.html#environment-variables + context = ray.train.get_context() + + os.environ["LOCAL_RANK"] = str(context.get_local_rank()) + os.environ["RANK"] = str(context.get_world_rank()) + os.environ["LOCAL_WORLD_SIZE"] = str(context.get_local_world_size()) + os.environ["WORLD_SIZE"] = str(context.get_world_size()) + os.environ["GROUP_RANK"] = str(context.get_node_rank()) + os.environ["GROUP_WORLD_SIZE"] = str( + context.get_world_size() / context.get_local_world_size() + ) + os.environ["ROLE_RANK"] = str(context.get_world_rank()) + os.environ["ROLE_WORLD_RANK"] = str(context.get_world_rank()) + os.environ["ROLE_WORLD_SIZE"] = str(context.get_world_size()) + + # EFA and XLA setup + # https://github.com/aws/libfabric/blob/master/prov/efa/src/rxr/rxr_init.c + # https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128.sh # noqa + os.environ["FI_PROVIDER"] = "efa" + os.environ["FI_EFA_USE_DEVICE_RDMA"] = "1" + os.environ["FI_EFA_FORK_SAFE"] = "1" + os.environ["XLA_TRANSFER_SEED_ASYNC"] = "1" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + + +def _setup_xla_torch_process_group(): + try: + import torch.distributed as dist + import torch_xla.core.xla_model as xm # noqa F401 + import torch_xla.distributed.xla_backend # noqa F401 + + dist.init_process_group("xla") + except ImportError: + raise ImportError("torch_xla must be installed to use torch_xla backend.") + + +# The following env vars enable Neuron graph extraction for parallel compilation +# Note: model outputs are invalid and should be ignored while these env vars are set +def _set_neuron_parallel_compile_env_vars(): + os.environ["NEURON_PARALLEL_COMPILE"] = "1" + os.environ["NEURON_EXTRACT_GRAPHS_ONLY"] = "1" + os.environ["NEURON_FALL_BACK_TO_NULL_NEFF"] = "1" + + +# Compile previously extracted Neuron graphs +def _neuron_compile_extracted_graphs(): + try: + from libneuronxla.neuron_cc_cache import CacheUrl + from libneuronxla.neuron_parallel_compile import parallel_compile + except ImportError: + raise ImportError( + "libneuronxla must be installed to use Neuron parallel compilation." + ) + + # Only 1 worker per node should run parallel_compile() + if os.environ.get("LOCAL_RANK") == "0": + logger.info("Compiling extracted graphs on local rank0 worker") + + parallel_compile_workdir = ( + f"/tmp/{os.environ.get('USER','no-user')}/parallel_compile_workdir/" + ) + if os.path.exists(parallel_compile_workdir): + shutil.rmtree(parallel_compile_workdir) + os.makedirs(parallel_compile_workdir, exist_ok=True) + + # Users can set the cache directory using --cache_dir in NEURON_CC_FLAGS or by + # using NEURON_COMPILE_CACHE_URL. --cache_dir takes precedence. 
+ explicit_cache_dir = None + if neuron_cc_flags := os.environ.get("NEURON_CC_FLAGS"): + if s := re.search(r"--cache_dir[= ](\S+)", neuron_cc_flags): + explicit_cache_dir = s.group(1) + + parallel_compile( + parallel_compile_workdir, + CacheUrl.get_cache_url(explicit_cache_dir), + ) + + +class _TorchAwsNeuronXLABackend(Backend): + unique_run_id: str = str(uuid.uuid4()) + + def on_start(self, worker_group: WorkerGroup, backend_config: TorchXLAConfig): + """Logic ran right before training is started.""" + + # On previous worker failure, we don't run graceful shutdown on workers. + # This would leak any running xrt server. + worker_group.execute(_kill_xrt_server) + + # Get master address and port from the first worker. + master_addr, master_port = worker_group.execute_single(0, get_address_and_port) + + def set_env_vars(addr, port): + os.environ["MASTER_ADDR"] = addr + os.environ["MASTER_PORT"] = str(port) + # To trigger the xrt server + os.environ["TORCHELASTIC_RUN_ID"] = self.unique_run_id + + # Set the env vars on all workers. + worker_group.execute(set_env_vars, addr=master_addr, port=master_port) + + # Set up env vars for neuron parallel compilation graph extraction + if backend_config.neuron_parallel_compile: + logger.info("Extracting graphs for Neuron parallel compilation") + worker_group.execute(_set_neuron_parallel_compile_env_vars) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: TorchXLAConfig + ): + """ + Configure the environment variables for the worker group. + And initialize the xla distributed process group. + TODO: Current setup only supports homogenous cluster with + neuron_cores accelerator and xrt runtime. + """ + worker_group.execute(_set_xla_env_vars) + worker_group.execute(_setup_xla_torch_process_group) + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchXLAConfig): + """ + Logic ran right after training is finished. + This is a sanity cleanup to kill xrt server, and to optionally + run neuron parallel graph compilation + """ + worker_group.execute(_kill_xrt_server) + + # Compile the extracted graphs. This must run at end of training. 
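# A minimal sketch of the Backend hook sequence that `_TorchAwsNeuronXLABackend`
# uses above (on_start -> on_training_start -> on_shutdown). The config class,
# backend class, and environment variable here are illustrative only.
import os
from dataclasses import dataclass

from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import Backend, BackendConfig


def _mark_worker_ready():
    os.environ["EXAMPLE_BACKEND_READY"] = "1"  # hypothetical marker


class _ExampleBackend(Backend):
    def on_start(self, worker_group: WorkerGroup, backend_config: "ExampleConfig"):
        # Runs before training starts, e.g. to set env vars on every worker.
        worker_group.execute(_mark_worker_ready)

    def on_training_start(
        self, worker_group: WorkerGroup, backend_config: "ExampleConfig"
    ):
        # Runs once workers are up, e.g. to initialize a process group.
        pass

    def on_shutdown(self, worker_group: WorkerGroup, backend_config: "ExampleConfig"):
        # Cleanup after training finishes.
        pass


@dataclass
class ExampleConfig(BackendConfig):
    @property
    def backend_cls(self):
        return _ExampleBackend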
+ if backend_config.neuron_parallel_compile: + worker_group.execute(_neuron_compile_extracted_graphs) diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b5a3988bd457bf7349c9981c51e1dc604809c155 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/v2/lightgbm/lightgbm_trainer.py @@ -0,0 +1,154 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.lightgbm.config import LightGBMConfig, get_network_params # noqa +from ray.train.trainer import GenDataset +from ray.train.v2._internal.constants import _UNSUPPORTED +from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer + +logger = logging.getLogger(__name__) + + +class LightGBMTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel LightGBM training. + + Example + ------- + + .. testcode:: + + import lightgbm as lgb + + import ray.data + import ray.train + from ray.train.lightgbm import RayTrainReportCallback + from ray.train.lightgbm.v2 import LightGBMTrainer + + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. Get the dataset shard for the worker and convert to a `lgb.Dataset` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + train_set = lgb.Dataset(train_X, label=train_y) + eval_set = lgb.Dataset(eval_X, label=eval_y) + + # 2. Run distributed data-parallel training. + # `get_network_params` sets up the necessary configurations for LightGBM + # to set up the data parallel training worker group on your Ray cluster. + params = { + "objective": "regression", + # Adding the line below is the only change needed + # for your `lgb.train` call! + **ray.train.lightgbm.v2.get_network_params(), + } + lgb.train( + params, + train_set, + valid_sets=[eval_set], + valid_names=["eval"], + # To access the checkpoint from trainer, you need this callback. + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32, 32 + 16)] + ) + trainer = LightGBMTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. 
testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. + lightgbm_config: The configuration for setting up the distributed lightgbm + backend. See :class:`~ray.train.lightgbm.LightGBMConfig` for more info. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + lightgbm_config: Optional[LightGBMConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = _UNSUPPORTED, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(LightGBMTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=lightgbm_config or LightGBMConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ): + """Retrieve the LightGBM model stored in this checkpoint. + + This API is deprecated. Use `RayTrainReportCallback.get_model` instead. + """ + raise DeprecationWarning( + "`LightGBMTrainer.get_model` is deprecated. " + "Use `RayTrainReportCallback.get_model` instead." 
+ ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..44efc6b2dfb7abd1eea105dad6e2e07b0e8d094a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/v2/xgboost/xgboost_trainer.py @@ -0,0 +1,154 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.trainer import GenDataset +from ray.train.v2._internal.constants import _UNSUPPORTED +from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer +from ray.train.xgboost import XGBoostConfig + +logger = logging.getLogger(__name__) + + +class XGBoostTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel XGBoost training. + + Example + ------- + + .. testcode:: + + import xgboost + + import ray.data + import ray.train + from ray.train.xgboost import RayTrainReportCallback + from ray.train.xgboost import XGBoostTrainer + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + dtrain = xgboost.DMatrix(train_X, label=train_y) + deval = xgboost.DMatrix(eval_X, label=eval_y) + + params = { + "tree_method": "approx", + "objective": "reg:squarederror", + "eta": 1e-4, + "subsample": 0.5, + "max_depth": 2, + } + + # 2. Do distributed data-parallel training. + # Ray Train sets up the necessary coordinator processes and + # environment variables for your workers to communicate with each other. + bst = xgboost.train( + params, + dtrain=dtrain, + evals=[(deval, "validation")], + num_boost_round=10, + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)]) + trainer = XGBoostTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. 
+ xgboost_config: The configuration for setting up the distributed xgboost + backend. Defaults to using the "rabit" backend. + See :class:`~ray.train.xgboost.XGBoostConfig` for more info. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + xgboost_config: Optional[XGBoostConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = _UNSUPPORTED, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(XGBoostTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=xgboost_config or XGBoostConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ): + """Retrieve the XGBoost model stored in this checkpoint. + + This API is deprecated. Use `RayTrainReportCallback.get_model` instead. + """ + raise DeprecationWarning( + "`XGBoostTrainer.get_model` is deprecated. " + "Use `RayTrainReportCallback.get_model` instead." + )