Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_consts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_consts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/node_consts.py +17 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/node_head.py +496 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/actor_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/actor_pool.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/annotations.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/check_open_ports.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/check_serialize.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/client_connect.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/debug.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/debugpy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/iter.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/iter_metrics.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/metrics.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/placement_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/queue.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/rpdb.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/scheduling_strategies.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/serialization.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/serialization_addons.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/__pycache__/timer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/__init__.py +78 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/accelerators.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/tpu.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/accelerators.py +33 -0
- .venv/lib/python3.11/site-packages/ray/util/accelerators/tpu.py +39 -0
- .venv/lib/python3.11/site-packages/ray/util/annotations.py +268 -0
- .venv/lib/python3.11/site-packages/ray/util/client/api.py +406 -0
- .venv/lib/python3.11/site-packages/ray/util/client/client_app.py +90 -0
- .venv/lib/python3.11/site-packages/ray/util/client/common.py +956 -0
- .venv/lib/python3.11/site-packages/ray/util/client/dataclient.py +599 -0
- .venv/lib/python3.11/site-packages/ray/util/client/options.py +47 -0
- .venv/lib/python3.11/site-packages/ray/util/client/ray_client_helpers.py +115 -0
- .venv/lib/python3.11/site-packages/ray/util/client/runtime_context.py +65 -0
- .venv/lib/python3.11/site-packages/ray/util/client/worker.py +908 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__init__.py +63 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/callbacks.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/common.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/optimizations.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/scheduler.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__init__.py
ADDED
File without changes

.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (198 Bytes)

.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_agent.cpython-311.pyc
ADDED
Binary file (16.9 kB)

.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_consts.cpython-311.pyc
ADDED
Binary file (340 Bytes)

.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_manager.cpython-311.pyc
ADDED
Binary file (19.9 kB)

.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__init__.py
ADDED
File without changes

.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (199 Bytes)

.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_consts.cpython-311.pyc
ADDED
Binary file (824 Bytes)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/node_consts.py
ADDED
@@ -0,0 +1,17 @@
from ray._private.ray_constants import env_integer

NODE_STATS_UPDATE_INTERVAL_SECONDS = env_integer(
    "NODE_STATS_UPDATE_INTERVAL_SECONDS", 15
)
RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT = env_integer(
    "RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT", 10
)
MAX_COUNT_OF_GCS_RPC_ERROR = 10
# This is consistent with gcs_node_manager.cc
MAX_DEAD_NODES_TO_CACHE = env_integer("RAY_maximum_gcs_dead_node_cached_count", 1000)
RAY_DASHBOARD_NODE_SUBSCRIBER_POLL_SIZE = env_integer(
    "RAY_DASHBOARD_NODE_SUBSCRIBER_POLL_SIZE", 200
)
RAY_DASHBOARD_AGENT_POLL_INTERVAL_S = env_integer(
    "RAY_DASHBOARD_AGENT_POLL_INTERVAL_S", 1
)
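
Editor's note: each constant above is read through env_integer, so its value comes from an environment variable of the same name and falls back to the shown default when that variable is unset. A minimal sketch of overriding one of them follows; the value 30 is illustrative, not from the diff, and the variable must be set before any module that evaluates the constant is imported, because env_integer is read at import time.

import os

# Set before importing anything that evaluates the constant at import time.
os.environ["NODE_STATS_UPDATE_INTERVAL_SECONDS"] = "30"

from ray._private.ray_constants import env_integer

# Same pattern as node_consts.py: read an integer from the environment,
# falling back to the default (15) when the variable is unset.
interval = env_integer("NODE_STATS_UPDATE_INTERVAL_SECONDS", 15)
print(interval)  # prints 30 because of the override above
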
.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/node_head.py
ADDED
@@ -0,0 +1,496 @@
import asyncio
import json
import logging
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
from typing import AsyncGenerator, Iterable, List

import aiohttp.web
import grpc

import ray._private.utils
import ray.dashboard.consts as dashboard_consts
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray._private import ray_constants
from ray._private.collections_utils import split
from ray._private.gcs_pubsub import GcsAioNodeInfoSubscriber
from ray._private.ray_constants import (
    DEBUG_AUTOSCALING_ERROR,
    DEBUG_AUTOSCALING_STATUS,
    env_integer,
)
from ray._private.gcs_pubsub import GcsAioResourceUsageSubscriber
from ray._private.utils import get_or_create_event_loop
from ray.autoscaler._private.util import (
    LoadMetricsSummary,
    get_per_node_breakdown_as_dict,
    parse_usage,
)
from ray.core.generated import gcs_pb2, node_manager_pb2, node_manager_pb2_grpc
from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
from ray.dashboard.datacenter import DataOrganizer, DataSource
from ray.dashboard.modules.node import node_consts
from ray.dashboard.modules.node.node_consts import (
    RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT,
)
from ray.dashboard.utils import async_loop_forever

logger = logging.getLogger(__name__)
routes = dashboard_optional_utils.DashboardHeadRouteTable


# NOTE: Executor in this head is intentionally constrained to just 1 thread by
#       default to limit its concurrency, therefore reducing potential for
#       GIL contention
RAY_DASHBOARD_NODE_HEAD_TPE_MAX_WORKERS = env_integer(
    "RAY_DASHBOARD_NODE_HEAD_TPE_MAX_WORKERS", 1
)


def _gcs_node_info_to_dict(message: gcs_pb2.GcsNodeInfo) -> dict:
    return dashboard_utils.message_to_dict(
        message, {"nodeId"}, always_print_fields_with_no_presence=True
    )


def node_stats_to_dict(message):
    decode_keys = {
        "actorId",
        "jobId",
        "taskId",
        "parentTaskId",
        "sourceActorId",
        "callerId",
        "rayletId",
        "workerId",
        "placementGroupId",
    }
    core_workers_stats = message.core_workers_stats
    message.ClearField("core_workers_stats")
    try:
        result = dashboard_utils.message_to_dict(message, decode_keys)
        result["coreWorkersStats"] = [
            dashboard_utils.message_to_dict(
                m, decode_keys, always_print_fields_with_no_presence=True
            )
            for m in core_workers_stats
        ]
        return result
    finally:
        message.core_workers_stats.extend(core_workers_stats)


class NodeHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        super().__init__(config)

        self._stubs = {}
        self._collect_memory_info = False

        DataSource.nodes.signal.append(self._update_stubs)
        # The time where the module is started.
        self._module_start_time = time.time()
        # The time it takes until the head node is registered. None means
        # head node hasn't been registered.
        self._head_node_registration_time_s = None
        # Queue of dead nodes to be removed, up to MAX_DEAD_NODES_TO_CACHE
        self._dead_node_queue = deque()

        self._executor = ThreadPoolExecutor(
            max_workers=RAY_DASHBOARD_NODE_HEAD_TPE_MAX_WORKERS,
            thread_name_prefix="node_head_executor",
        )

    async def _update_stubs(self, change):
        if change.old:
            node_id, node_info = change.old
            self._stubs.pop(node_id, None)
        if change.new:
            # TODO(fyrestone): Handle exceptions.
            node_id, node_info = change.new
            address = "{}:{}".format(
                node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])
            )
            options = ray_constants.GLOBAL_GRPC_OPTIONS
            channel = ray._private.utils.init_grpc_channel(
                address, options, asynchronous=True
            )
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            self._stubs[node_id] = stub

    def get_internal_states(self):
        return {
            "head_node_registration_time_s": self._head_node_registration_time_s,
            "registered_nodes": len(DataSource.nodes),
            "registered_agents": len(DataSource.agents),
            "module_lifetime_s": time.time() - self._module_start_time,
        }

    async def _subscribe_for_node_updates(self) -> AsyncGenerator[dict, None]:
        """
        Yields the initial state of all nodes, then yields the updated state of nodes.

        It makes GetAllNodeInfo call only once after the subscription is done, to get
        the initial state of the nodes.
        """
        subscriber = GcsAioNodeInfoSubscriber(address=self.gcs_address)
        await subscriber.subscribe()

        # Get all node info from GCS. To prevent Time-of-check to time-of-use issue [1],
        # it happens after the subscription. That is, an update between
        # get-all-node-info and the subscription is not missed.
        # [1] https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use
        all_node_info = await self.gcs_aio_client.get_all_node_info(timeout=None)

        def _convert_to_dict(messages: Iterable[gcs_pb2.GcsNodeInfo]) -> List[dict]:
            return [_gcs_node_info_to_dict(m) for m in messages]

        all_node_infos = await get_or_create_event_loop().run_in_executor(
            self._executor,
            _convert_to_dict,
            all_node_info.values(),
        )

        for node in all_node_infos:
            yield node

        while True:
            try:
                node_id_updated_info_tuples = await subscriber.poll(
                    batch_size=node_consts.RAY_DASHBOARD_NODE_SUBSCRIBER_POLL_SIZE
                )

                if node_id_updated_info_tuples:
                    _, updated_infos_proto = zip(*node_id_updated_info_tuples)
                else:
                    updated_infos_proto = []

                updated_infos = await get_or_create_event_loop().run_in_executor(
                    self._executor,
                    _convert_to_dict,
                    updated_infos_proto,
                )

                for node in updated_infos:
                    yield node
            except Exception:
                logger.exception("Failed handling updated nodes.")

    async def _update_node(self, node: dict):
        node_id = node["nodeId"]  # hex
        if node["isHeadNode"] and not self._head_node_registration_time_s:
            self._head_node_registration_time_s = time.time() - self._module_start_time
            # Put head node ID in the internal KV to be read by JobAgent.
            # TODO(architkulkarni): Remove once State API exposes which
            # node is the head node.
            await self.gcs_aio_client.internal_kv_put(
                ray_constants.KV_HEAD_NODE_ID_KEY,
                node_id.encode(),
                overwrite=True,
                namespace=ray_constants.KV_NAMESPACE_JOB,
                timeout=GCS_RPC_TIMEOUT_SECONDS,
            )
        assert node["state"] in ["ALIVE", "DEAD"]
        is_alive = node["state"] == "ALIVE"
        # Prepare agents for alive node, and pop agents for dead node.
        if is_alive:
            if node_id not in DataSource.agents:
                # Agent port is read from internal KV, which is only populated
                # upon Agent startup. In case this update received before agent
                # fully started up, we schedule a task to asynchronously update
                # DataSource with appropriate agent port.
                asyncio.create_task(self._update_agent(node_id))
        else:
            DataSource.agents.pop(node_id, None)
            self._dead_node_queue.append(node_id)
            if len(self._dead_node_queue) > node_consts.MAX_DEAD_NODES_TO_CACHE:
                DataSource.nodes.pop(self._dead_node_queue.popleft(), None)
        DataSource.nodes[node_id] = node

    async def _update_agent(self, node_id):
        """
        Given a node, update the agent_port in DataSource.agents. Problem is it's not
        present until agent.py starts, so we need to loop waiting for agent.py writes
        its port to internal kv.
        """
        key = (
            f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{node_id}".encode()
        )
        while True:
            try:
                agent_addr = await self.gcs_aio_client.internal_kv_get(
                    key,
                    namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
                    timeout=None,
                )
                # The node may be dead already. Only update DataSource.agents if the
                # node is still alive.
                if DataSource.nodes.get(node_id, {}).get("state") != "ALIVE":
                    return
                if agent_addr:
                    DataSource.agents[node_id] = json.loads(agent_addr)
                    return
            except Exception:
                logger.exception(f"Error getting agent port for node {node_id}.")

            await asyncio.sleep(node_consts.RAY_DASHBOARD_AGENT_POLL_INTERVAL_S)

    async def _update_nodes(self):
        """
        Subscribe to node updates and update the internal states. If the head node is
        not registered after RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT, it logs a
        warning only once.
        """
        warning_shown = False
        async for node in self._subscribe_for_node_updates():
            await self._update_node(node)
            if not self._head_node_registration_time_s:
                # head node is not registered yet
                if (
                    not warning_shown
                    and (time.time() - self._module_start_time)
                    > RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT
                ):
                    logger.warning(
                        "Head node is not registered even after "
                        f"{RAY_DASHBOARD_HEAD_NODE_REGISTRATION_TIMEOUT} seconds. "
                        "The API server might not work correctly. Please "
                        "report a Github issue. Internal states :"
                        f"{self.get_internal_states()}"
                    )
                    warning_shown = True

    @routes.get("/internal/node_module")
    async def get_node_module_internal_state(self, req) -> aiohttp.web.Response:
        return dashboard_optional_utils.rest_response(
            success=True,
            message="",
            **self.get_internal_states(),
        )

    async def get_nodes_logical_resources(self) -> dict:

        from ray.autoscaler.v2.utils import is_autoscaler_v2

        if is_autoscaler_v2():
            from ray.autoscaler.v2.sdk import get_cluster_status

            try:
                cluster_status = get_cluster_status(self.gcs_address)
            except Exception:
                logger.exception("Error getting cluster status")
                return {}

            per_node_resources = {}
            # TODO(rickyx): we should just return structure data rather than strings.
            for node in chain(cluster_status.active_nodes, cluster_status.idle_nodes):
                if not node.resource_usage:
                    continue

                usage_dict = {
                    r.resource_name: (r.used, r.total)
                    for r in node.resource_usage.usage
                }
                per_node_resources[node.node_id] = "\n".join(
                    parse_usage(usage_dict, verbose=True)
                )

            return per_node_resources

        # Legacy autoscaler status code.
        (status_string, error) = await asyncio.gather(
            *[
                self.gcs_aio_client.internal_kv_get(
                    key.encode(), namespace=None, timeout=GCS_RPC_TIMEOUT_SECONDS
                )
                for key in [
                    DEBUG_AUTOSCALING_STATUS,
                    DEBUG_AUTOSCALING_ERROR,
                ]
            ]
        )
        if not status_string:
            return {}
        status_dict = json.loads(status_string)

        lm_summary_dict = status_dict.get("load_metrics_report")
        if lm_summary_dict:
            lm_summary = LoadMetricsSummary(**lm_summary_dict)

        node_logical_resources = get_per_node_breakdown_as_dict(lm_summary)
        return node_logical_resources if error is None else {}

    @routes.get("/nodes")
    @dashboard_optional_utils.aiohttp_cache
    async def get_all_nodes(self, req) -> aiohttp.web.Response:
        view = req.query.get("view")
        if view == "summary":
            all_node_summary_task = DataOrganizer.get_all_node_summary()
            nodes_logical_resource_task = self.get_nodes_logical_resources()

            all_node_summary, nodes_logical_resources = await asyncio.gather(
                all_node_summary_task, nodes_logical_resource_task
            )

            return dashboard_optional_utils.rest_response(
                success=True,
                message="Node summary fetched.",
                summary=all_node_summary,
                node_logical_resources=nodes_logical_resources,
            )
        elif view is not None and view.lower() == "hostNameList".lower():
            alive_hostnames = set()
            for node in DataSource.nodes.values():
                if node["state"] == "ALIVE":
                    alive_hostnames.add(node["nodeManagerHostname"])
            return dashboard_optional_utils.rest_response(
                success=True,
                message="Node hostname list fetched.",
                host_name_list=list(alive_hostnames),
            )
        else:
            return dashboard_optional_utils.rest_response(
                success=False, message=f"Unknown view {view}"
            )

    @routes.get("/nodes/{node_id}")
    @dashboard_optional_utils.aiohttp_cache
    async def get_node(self, req) -> aiohttp.web.Response:
        node_id = req.match_info.get("node_id")
        node_info = await DataOrganizer.get_node_info(node_id)
        return dashboard_optional_utils.rest_response(
            success=True, message="Node details fetched.", detail=node_info
        )

    @async_loop_forever(node_consts.NODE_STATS_UPDATE_INTERVAL_SECONDS)
    async def _update_node_stats(self):
        timeout = max(2, node_consts.NODE_STATS_UPDATE_INTERVAL_SECONDS - 1)

        # NOTE: We copy stubs to make sure
        #       it doesn't change during the iteration (since its being updated
        #       from another async task)
        current_stub_node_id_tuples = list(self._stubs.items())

        node_ids = []
        get_node_stats_tasks = []

        for _, (node_id, stub) in enumerate(current_stub_node_id_tuples):
            node_info = DataSource.nodes.get(node_id)
            if node_info["state"] != "ALIVE":
                continue

            node_ids.append(node_id)
            get_node_stats_tasks.append(
                stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(
                        include_memory_info=self._collect_memory_info
                    ),
                    timeout=timeout,
                )
            )

        responses = []

        # NOTE: We're chunking up fetching of the stats to run in batches of no more
        #       than 100 nodes at a time to avoid flooding the event-loop's queue
        #       with potentially a large, uninterrupted sequence of tasks updating
        #       the node stats for very large clusters.
        for get_node_stats_tasks_chunk in split(get_node_stats_tasks, 100):
            current_chunk_responses = await asyncio.gather(
                *get_node_stats_tasks_chunk,
                return_exceptions=True,
            )

            responses.extend(current_chunk_responses)

            # We're doing short (25ms) yield after every chunk to make sure
            # - We're not overloading the event-loop with excessive # of tasks
            # - Allowing 10k nodes stats fetches be sent out performed in 2.5s
            await asyncio.sleep(0.025)

        def postprocess(node_id_response_tuples):
            """Pure function reorganizing the data into {node_id: stats}."""
            new_node_stats = {}

            for node_id, response in node_id_response_tuples:
                if isinstance(response, asyncio.CancelledError):
                    pass
                elif isinstance(response, grpc.RpcError):
                    if response.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
                        message = (
                            f"Cannot reach the node, {node_id}, after timeout "
                            f" {timeout}. This node may have been overloaded, "
                            "terminated, or the network is slow."
                        )
                    elif response.code() == grpc.StatusCode.UNAVAILABLE:
                        message = (
                            f"Cannot reach the node, {node_id}. "
                            "The node may have been terminated."
                        )
                    else:
                        message = f"Error updating node stats of {node_id}."

                    logger.error(message, exc_info=response)
                elif isinstance(response, Exception):
                    logger.error(
                        f"Error updating node stats of {node_id}.", exc_info=response
                    )
                else:
                    new_node_stats[node_id] = node_stats_to_dict(response)

            return new_node_stats

        # NOTE: Zip will silently truncate to shorter argument that potentially
        #       could lead to subtle hard to catch issues, hence the assertion
        assert len(node_ids) == len(responses)

        new_node_stats = await get_or_create_event_loop().run_in_executor(
            self._executor, postprocess, zip(node_ids, responses)
        )

        for node_id, new_stat in new_node_stats.items():
            DataSource.node_stats[node_id] = new_stat

    async def _update_node_physical_stats(self):
        """
        Update DataSource.node_physical_stats by subscribing to the GCS resource usage.
        """
        subscriber = GcsAioResourceUsageSubscriber(address=self.gcs_address)
        await subscriber.subscribe()

        loop = get_or_create_event_loop()

        while True:
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd...'
                key, data = await subscriber.poll()
                if key is None:
                    continue

                # NOTE: Every iteration is executed inside the thread-pool executor
                #       (TPE) to avoid blocking the Dashboard's event-loop
                parsed_data = await loop.run_in_executor(
                    self._executor, json.loads, data
                )

                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = parsed_data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from _update_node_physical_stats."
                )

    async def run(self, server):
        await asyncio.gather(
            self._update_nodes(),
            self._update_node_stats(),
            self._update_node_physical_stats(),
        )

    @staticmethod
    def is_minimal_module():
        return False
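
Editor's note: NodeHead registers its HTTP handlers on the dashboard's route table, so the views above can be exercised with plain HTTP requests. A minimal sketch, assuming a running Ray dashboard at its default address (http://127.0.0.1:8265) and the third-party requests package; adjust the address for your cluster.

import requests

DASHBOARD = "http://127.0.0.1:8265"  # assumption: default dashboard address

# Summary view, served by NodeHead.get_all_nodes via routes.get("/nodes").
summary = requests.get(f"{DASHBOARD}/nodes", params={"view": "summary"})
print(summary.json())

# Hostname list of alive nodes, handled by the same endpoint.
hosts = requests.get(f"{DASHBOARD}/nodes", params={"view": "hostNameList"})
print(hosts.json())
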
.venv/lib/python3.11/site-packages/ray/util/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (3.33 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/actor_group.cpython-311.pyc
ADDED
Binary file (12.6 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/actor_pool.cpython-311.pyc
ADDED
Binary file (17.6 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/annotations.cpython-311.pyc
ADDED
Binary file (11.1 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/check_open_ports.cpython-311.pyc
ADDED
Binary file (9.19 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/check_serialize.cpython-311.pyc
ADDED
Binary file (11.6 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/client_connect.cpython-311.pyc
ADDED
Binary file (3.37 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/debug.cpython-311.pyc
ADDED
Binary file (10.1 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/debugpy.cpython-311.pyc
ADDED
Binary file (6.59 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/iter.cpython-311.pyc
ADDED
Binary file (65.9 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/iter_metrics.cpython-311.pyc
ADDED
Binary file (3.94 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/metrics.cpython-311.pyc
ADDED
Binary file (14.6 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/placement_group.cpython-311.pyc
ADDED
Binary file (24.2 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/queue.cpython-311.pyc
ADDED
Binary file (16.5 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/rpdb.cpython-311.pyc
ADDED
Binary file (19.5 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/scheduling_strategies.cpython-311.pyc
ADDED
Binary file (9.84 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/serialization.cpython-311.pyc
ADDED
Binary file (3.52 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/serialization_addons.cpython-311.pyc
ADDED
Binary file (1.96 kB)

.venv/lib/python3.11/site-packages/ray/util/__pycache__/timer.cpython-311.pyc
ADDED
Binary file (4.12 kB)
.venv/lib/python3.11/site-packages/ray/util/accelerators/__init__.py
ADDED
@@ -0,0 +1,78 @@
import warnings

from ray.util.accelerators import tpu
from ray.util.accelerators.accelerators import (
    NVIDIA_TESLA_V100,
    NVIDIA_TESLA_P100,
    NVIDIA_TESLA_T4,
    NVIDIA_TESLA_P4,
    NVIDIA_TESLA_K80,
    NVIDIA_TESLA_A10G,
    NVIDIA_L4,
    NVIDIA_A100,
    NVIDIA_H100,
    INTEL_MAX_1550,
    INTEL_MAX_1100,
    INTEL_GAUDI,
    AMD_INSTINCT_MI100,
    AMD_INSTINCT_MI210,
    AMD_INSTINCT_MI250,
    AMD_INSTINCT_MI250x,
    AMD_INSTINCT_MI300x,
    AMD_RADEON_R9_200_HD_7900,
    AMD_RADEON_HD_7900,
    AWS_NEURON_CORE,
    GOOGLE_TPU_V2,
    GOOGLE_TPU_V3,
    GOOGLE_TPU_V4,
    GOOGLE_TPU_V5P,
    GOOGLE_TPU_V5LITEPOD,
    GOOGLE_TPU_V6E,
)

__all__ = [
    "tpu",
    "NVIDIA_TESLA_V100",
    "NVIDIA_TESLA_P100",
    "NVIDIA_TESLA_T4",
    "NVIDIA_TESLA_P4",
    "NVIDIA_TESLA_K80",
    "NVIDIA_TESLA_A10G",
    "NVIDIA_L4",
    "NVIDIA_A100",
    "NVIDIA_A100_40G",
    "NVIDIA_A100_80G",
    "NVIDIA_H100",
    "INTEL_MAX_1550",
    "INTEL_MAX_1100",
    "INTEL_GAUDI",
    "AMD_INSTINCT_MI100",
    "AMD_INSTINCT_MI210",
    "AMD_INSTINCT_MI250",
    "AMD_INSTINCT_MI250x",
    "AMD_INSTINCT_MI300x",
    "AMD_RADEON_R9_200_HD_7900",
    "AMD_RADEON_HD_7900",
    "AWS_NEURON_CORE",
    "GOOGLE_TPU_V2",
    "GOOGLE_TPU_V3",
    "GOOGLE_TPU_V4",
    "GOOGLE_TPU_V5P",
    "GOOGLE_TPU_V5LITEPOD",
    "GOOGLE_TPU_V6E",
    # Deprecated
    "NVIDIA_TESLA_A100",
]


def __getattr__(name: str):
    if name == "NVIDIA_TESLA_A100":
        from ray.util.annotations import RayDeprecationWarning

        warnings.warn(
            "NVIDIA_TESLA_A100 is deprecated, use NVIDIA_A100 instead.",
            RayDeprecationWarning,
            stacklevel=2,
        )
        return NVIDIA_A100
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
.venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.13 kB)

.venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/accelerators.cpython-311.pyc
ADDED
Binary file (1.33 kB)

.venv/lib/python3.11/site-packages/ray/util/accelerators/__pycache__/tpu.cpython-311.pyc
ADDED
Binary file (1.95 kB)
.venv/lib/python3.11/site-packages/ray/util/accelerators/accelerators.py
ADDED
@@ -0,0 +1,33 @@
NVIDIA_TESLA_V100 = "V100"
NVIDIA_TESLA_P100 = "P100"
NVIDIA_TESLA_T4 = "T4"
NVIDIA_TESLA_P4 = "P4"
NVIDIA_TESLA_K80 = "K80"
NVIDIA_TESLA_A10G = "A10G"
NVIDIA_L4 = "L4"
NVIDIA_L40S = "L40S"
NVIDIA_A100 = "A100"
NVIDIA_H100 = "H100"
INTEL_MAX_1550 = "Intel-GPU-Max-1550"
INTEL_MAX_1100 = "Intel-GPU-Max-1100"
INTEL_GAUDI = "Intel-GAUDI"
AMD_INSTINCT_MI100 = "AMD-Instinct-MI100"
AMD_INSTINCT_MI250x = "AMD-Instinct-MI250X"
AMD_INSTINCT_MI250 = "AMD-Instinct-MI250X-MI250"
AMD_INSTINCT_MI210 = "AMD-Instinct-MI210"
AMD_INSTINCT_MI300x = "AMD-Instinct-MI300X-OAM"
AMD_RADEON_R9_200_HD_7900 = "AMD-Radeon-R9-200-HD-7900"
AMD_RADEON_HD_7900 = "AMD-Radeon-HD-7900"
AWS_NEURON_CORE = "aws-neuron-core"
GOOGLE_TPU_V2 = "TPU-V2"
GOOGLE_TPU_V3 = "TPU-V3"
GOOGLE_TPU_V4 = "TPU-V4"
GOOGLE_TPU_V5P = "TPU-V5P"
GOOGLE_TPU_V5LITEPOD = "TPU-V5LITEPOD"
GOOGLE_TPU_V6E = "TPU-V6E"

# Use these instead of NVIDIA_A100 if you need a specific accelerator size. Note that
# these labels are not auto-added to nodes, you'll have to add them manually in
# addition to the default A100 label if needed.
NVIDIA_A100_40G = "A100-40G"
NVIDIA_A100_80G = "A100-80G"
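
Editor's note: these constants are the string labels Ray's scheduler matches when a task or actor requests a specific accelerator. A short sketch of the usual way they are consumed; it uses the standard accelerator_type option of ray.remote and assumes the cluster actually has a node with a matching GPU, otherwise the task stays pending.

import ray
from ray.util.accelerators import NVIDIA_TESLA_V100

ray.init()

@ray.remote(num_gpus=1, accelerator_type=NVIDIA_TESLA_V100)
def train():
    # Scheduled only on a node whose GPU advertises the "V100" label.
    return "running on a V100 node"

print(ray.get(train.remote()))
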
.venv/lib/python3.11/site-packages/ray/util/accelerators/tpu.py
ADDED
@@ -0,0 +1,39 @@
from typing import Optional
from ray._private.accelerators import TPUAcceleratorManager
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
def get_current_pod_name() -> Optional[str]:
    """
    Return the name of the TPU pod that the worker is a part of.

    Returns:
        The name of the TPU pod. Returns None if not part of a TPU pod.
    """
    tpu_name = TPUAcceleratorManager.get_current_node_tpu_name()
    if tpu_name == "":
        tpu_name = None
    return tpu_name


@PublicAPI(stability="alpha")
def get_current_pod_worker_count() -> Optional[int]:
    """
    Count the number of workers associated with the TPU pod that the worker belongs to.

    Returns:
        The total number of workers in the TPU pod. Returns None if the worker is not
        part of a TPU pod.
    """
    return TPUAcceleratorManager.get_num_workers_in_current_tpu_pod()


@PublicAPI(stability="alpha")
def get_num_tpu_chips_on_node() -> int:
    """
    Return the number of TPU chips on the node.

    Returns:
        The total number of chips on the TPU node. Returns 0 if none are found.
    """
    return TPUAcceleratorManager.get_current_node_num_accelerators()
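
Editor's note: a small sketch of calling the helpers above from inside a Ray task running on a TPU host. The resources={"TPU": 4} request is an assumption about how the cluster advertises TPU resources; on a non-TPU node get_current_pod_name() returns None and get_num_tpu_chips_on_node() returns 0, as documented above.

import ray
from ray.util.accelerators import tpu

@ray.remote(resources={"TPU": 4})  # assumption: TPU capacity exposed under the "TPU" resource key
def describe_tpu_host():
    return {
        "pod_name": tpu.get_current_pod_name(),
        "pod_worker_count": tpu.get_current_pod_worker_count(),
        "chips_on_node": tpu.get_num_tpu_chips_on_node(),
    }

ray.init()
print(ray.get(describe_tpu_host.remote()))
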
.venv/lib/python3.11/site-packages/ray/util/annotations.py
ADDED
@@ -0,0 +1,268 @@
from enum import Enum
from typing import Optional
import inspect
import sys
import warnings
from functools import wraps


class AnnotationType(Enum):
    PUBLIC_API = "PublicAPI"
    DEVELOPER_API = "DeveloperAPI"
    DEPRECATED = "Deprecated"
    UNKNOWN = "Unknown"


def PublicAPI(*args, **kwargs):
    """Annotation for documenting public APIs.

    Public APIs are classes and methods exposed to end users of Ray.

    If ``stability="alpha"``, the API can be used by advanced users who are
    tolerant to and expect breaking changes.

    If ``stability="beta"``, the API is still public and can be used by early
    users, but are subject to change.

    If ``stability="stable"``, the APIs will remain backwards compatible across
    minor Ray releases (e.g., Ray 1.4 -> 1.8).

    For a full definition of the stability levels, please refer to the
    :ref:`Ray API Stability definitions <api-stability>`.

    Args:
        stability: One of {"stable", "beta", "alpha"}.
        api_group: Optional. Used only for doc rendering purpose. APIs in the same group
            will be grouped together in the API doc pages.

    Examples:
        >>> from ray.util.annotations import PublicAPI
        >>> @PublicAPI
        ... def func(x):
        ...     return x

        >>> @PublicAPI(stability="beta")
        ... def func(y):
        ...     return y
    """
    if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
        return PublicAPI(stability="stable", api_group="Others")(args[0])

    if "stability" in kwargs:
        stability = kwargs["stability"]
        assert stability in ["stable", "beta", "alpha"], stability
    else:
        stability = "stable"
    api_group = kwargs.get("api_group", "Others")

    def wrap(obj):
        if stability in ["alpha", "beta"]:
            message = (
                f"**PublicAPI ({stability}):** This API is in {stability} "
                "and may change before becoming stable."
            )
            _append_doc(obj, message=message)

        _mark_annotated(obj, type=AnnotationType.PUBLIC_API, api_group=api_group)
        return obj

    return wrap


def DeveloperAPI(*args, **kwargs):
    """Annotation for documenting developer APIs.

    Developer APIs are lower-level methods explicitly exposed to advanced Ray
    users and library developers. Their interfaces may change across minor
    Ray releases.

    Examples:
        >>> from ray.util.annotations import DeveloperAPI
        >>> @DeveloperAPI
        ... def func(x):
        ...     return x
    """
    if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
        return DeveloperAPI()(args[0])

    def wrap(obj):
        _append_doc(
            obj,
            message="**DeveloperAPI:** This API may change across minor Ray releases.",
        )
        _mark_annotated(obj, type=AnnotationType.DEVELOPER_API)
        return obj

    return wrap


class RayDeprecationWarning(DeprecationWarning):
    """Specialized Deprecation Warning for fine grained filtering control"""

    pass


# By default, print the first occurrence of matching warnings for
# each module where the warning is issued (regardless of line number)
if not sys.warnoptions:
    warnings.filterwarnings("module", category=RayDeprecationWarning)


def Deprecated(*args, **kwargs):
    """Annotation for documenting a deprecated API.

    Deprecated APIs may be removed in future releases of Ray.

    Args:
        message: a message to help users understand the reason for the
            deprecation, and provide a migration path.

    Examples:
        >>> from ray.util.annotations import Deprecated
        >>> @Deprecated
        ... def func(x):
        ...     return x

        >>> @Deprecated(message="g() is deprecated because the API is error "
        ...     "prone. Please call h() instead.")
        ... def g(y):
        ...     return y
    """
    if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
        return Deprecated()(args[0])

    doc_message = (
        "**DEPRECATED**: This API is deprecated and may be removed "
        "in future Ray releases."
    )
    warning_message = (
        "This API is deprecated and may be removed in future Ray releases. "
        "You could suppress this warning by setting env variable "
        'PYTHONWARNINGS="ignore::DeprecationWarning"'
    )

    warning = kwargs.pop("warning", False)

    if "message" in kwargs:
        doc_message = doc_message + "\n" + kwargs["message"]
        warning_message = warning_message + "\n" + kwargs["message"]
        del kwargs["message"]

    if kwargs:
        raise ValueError("Unknown kwargs: {}".format(kwargs.keys()))

    def inner(obj):
        _append_doc(obj, message=doc_message, directive="warning")
        _mark_annotated(obj, type=AnnotationType.DEPRECATED)

        if not warning:
            return obj

        if inspect.isclass(obj):
            obj_init = obj.__init__

            def patched_init(*args, **kwargs):
                warnings.warn(warning_message, RayDeprecationWarning, stacklevel=2)
                return obj_init(*args, **kwargs)

            obj.__init__ = patched_init
            return obj
        else:
            # class method or function.
            @wraps(obj)
            def wrapper(*args, **kwargs):
                warnings.warn(warning_message, RayDeprecationWarning, stacklevel=2)
                return obj(*args, **kwargs)

            return wrapper

    return inner


def _append_doc(obj, *, message: str, directive: Optional[str] = None) -> str:
    if not obj.__doc__:
        obj.__doc__ = ""

    obj.__doc__ = obj.__doc__.rstrip()

    indent = _get_indent(obj.__doc__)
    obj.__doc__ += "\n\n"

    if directive is not None:
        obj.__doc__ += f"{' ' * indent}.. {directive}::\n\n"

        message = message.replace("\n", "\n" + " " * (indent + 4))
        obj.__doc__ += f"{' ' * (indent + 4)}{message}"
    else:
        message = message.replace("\n", "\n" + " " * (indent + 4))
        obj.__doc__ += f"{' ' * indent}{message}"
    obj.__doc__ += f"\n{' ' * indent}"


def _get_indent(docstring: str) -> int:
    """

    Example:
        >>> def f():
        ...     '''Docstring summary.'''
        >>> f.__doc__
        'Docstring summary.'
        >>> _get_indent(f.__doc__)
        0

        >>> def g(foo):
        ...     '''Docstring summary.
        ...
        ...     Args:
        ...         foo: Does bar.
        ...     '''
        >>> g.__doc__
        'Docstring summary.\\n\\n    Args:\\n        foo: Does bar.\\n    '
        >>> _get_indent(g.__doc__)
        4

        >>> class A:
        ...     def h():
        ...         '''Docstring summary.
        ...
        ...         Returns:
        ...             None.
        ...         '''
        >>> A.h.__doc__
        'Docstring summary.\\n\\n        Returns:\\n            None.\\n        '
        >>> _get_indent(A.h.__doc__)
        8
    """
    if not docstring:
        return 0

    non_empty_lines = list(filter(bool, docstring.splitlines()))
    if len(non_empty_lines) == 1:
        # Docstring contains summary only.
        return 0

    # The docstring summary isn't indented, so check the indentation of the second
    # non-empty line.
    return len(non_empty_lines[1]) - len(non_empty_lines[1].lstrip())


def _mark_annotated(
    obj, type: AnnotationType = AnnotationType.UNKNOWN, api_group="Others"
) -> None:
    # Set magic token for check_api_annotations linter.
    if hasattr(obj, "__name__"):
        obj._annotated = obj.__name__
    obj._annotated_type = type
    obj._annotated_api_group = api_group


def _is_annotated(obj) -> bool:
    # Check the magic token exists and applies to this class (not a subclass).
    return hasattr(obj, "_annotated") and obj._annotated == obj.__name__


def _get_annotation_type(obj) -> Optional[str]:
    if not _is_annotated(obj):
        return None

    return obj._annotated_type.value
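
Editor's note: a short sketch of the Deprecated decorator above used with warning=True, which appends a warning directive to the docstring and issues a RayDeprecationWarning each time the wrapped function is called. The function names here are hypothetical.

import warnings
from ray.util.annotations import Deprecated, RayDeprecationWarning

@Deprecated(message="Use new_api() instead.", warning=True)
def old_api(x):
    return x * 2

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    result = old_api(21)

print(result)  # 42: the wrapped function still runs normally
print(caught[0].category is RayDeprecationWarning)  # True: a warning was emitted on the call
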
.venv/lib/python3.11/site-packages/ray/util/client/api.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""This file defines the interface between the ray client worker
|
| 2 |
+
and the overall ray module API.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
from concurrent.futures import Future
|
| 7 |
+
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
|
| 8 |
+
|
| 9 |
+
from ray._private import ray_option_utils
|
| 10 |
+
from ray.util.client.runtime_context import _ClientWorkerPropertyAPI
|
| 11 |
+
|
| 12 |
+
if TYPE_CHECKING:
|
| 13 |
+
from ray.actor import ActorClass
|
| 14 |
+
from ray.core.generated.ray_client_pb2 import DataResponse
|
| 15 |
+
from ray.remote_function import RemoteFunction
|
| 16 |
+
from ray.util.client.common import ClientActorHandle, ClientObjectRef, ClientStub
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _as_bytes(value):
|
| 22 |
+
if isinstance(value, str):
|
| 23 |
+
return value.encode("utf-8")
|
| 24 |
+
return value
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class _ClientAPI:
|
| 28 |
+
"""The Client-side methods corresponding to the ray API. Delegates
|
| 29 |
+
to the Client Worker that contains the connection to the ClientServer.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(self, worker=None):
|
| 33 |
+
self.worker = worker
|
| 34 |
+
|
| 35 |
+
def get(self, vals, *, timeout=None):
|
| 36 |
+
"""get is the hook stub passed on to replace `ray.get`
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
vals: [Client]ObjectRef or list of these refs to retrieve.
|
| 40 |
+
timeout: Optional timeout in milliseconds
|
| 41 |
+
"""
|
| 42 |
+
return self.worker.get(vals, timeout=timeout)
|
| 43 |
+
|
| 44 |
+
def put(self, *args, **kwargs):
|
| 45 |
+
"""put is the hook stub passed on to replace `ray.put`
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
val: The value to `put`.
|
| 49 |
+
args: opaque arguments
|
| 50 |
+
kwargs: opaque keyword arguments
|
| 51 |
+
"""
|
| 52 |
+
return self.worker.put(*args, **kwargs)
|
| 53 |
+
|
| 54 |
+
def wait(self, *args, **kwargs):
|
| 55 |
+
"""wait is the hook stub passed on to replace `ray.wait`
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
args: opaque arguments
|
| 59 |
+
kwargs: opaque keyword arguments
|
| 60 |
+
"""
|
| 61 |
+
return self.worker.wait(*args, **kwargs)
|
| 62 |
+
|
| 63 |
+
def remote(self, *args, **kwargs):
|
| 64 |
+
"""remote is the hook stub passed on to replace `ray.remote`.
|
| 65 |
+
|
| 66 |
+
This sets up remote functions or actors, as the decorator,
|
| 67 |
+
but does not execute them.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
args: opaque arguments
|
| 71 |
+
kwargs: opaque keyword arguments
|
| 72 |
+
"""
|
| 73 |
+
# Delayed import to avoid a cyclic import
|
| 74 |
+
from ray.util.client.common import remote_decorator
|
| 75 |
+
|
| 76 |
+
if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
|
| 77 |
+
# This is the case where the decorator is just @ray.remote.
|
| 78 |
+
return remote_decorator(options=None)(args[0])
|
| 79 |
+
assert (
|
| 80 |
+
len(args) == 0 and len(kwargs) > 0
|
| 81 |
+
), ray_option_utils.remote_args_error_string
|
| 82 |
+
return remote_decorator(options=kwargs)
|
| 83 |
+
|
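As an aside, a minimal sketch of the two call patterns this hook accepts (not part of the diff; it assumes `ray` is the connected client package from `ray.util.client`):

    @ray.remote                      # bare form: args == (func,), kwargs == {}
    def double(x):
        return 2 * x

    @ray.remote(num_cpus=2)          # options form: args == (), kwargs == {"num_cpus": 2}
    def triple(x):
        return 3 * x

    # Mixing positional and keyword arguments trips the assertion above.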
| 84 |
+
# TODO(mwtian): consider adding _internal_ prefix to call_remote /
|
| 85 |
+
# call_release / call_retain.
|
| 86 |
+
def call_remote(self, instance: "ClientStub", *args, **kwargs) -> List[Future]:
|
| 87 |
+
"""call_remote is called by stub objects to execute them remotely.
|
| 88 |
+
|
| 89 |
+
This is used by stub objects in situations where they're called
|
| 90 |
+
with .remote, eg, `f.remote()` or `actor_cls.remote()`.
|
| 91 |
+
This allows the client stub objects to delegate execution to be
|
| 92 |
+
implemented in the most effective way whether it's in the client,
|
| 93 |
+
clientserver, or raylet worker.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
instance: The Client-side stub reference to a remote object
|
| 97 |
+
args: opaque arguments
|
| 98 |
+
kwargs: opaque keyword arguments
|
| 99 |
+
"""
|
| 100 |
+
return self.worker.call_remote(instance, *args, **kwargs)
|
| 101 |
+
|
| 102 |
+
def call_release(self, id: bytes) -> None:
|
| 103 |
+
"""Attempts to release an object reference.
|
| 104 |
+
|
| 105 |
+
When client references are destructed, they release their reference,
|
| 106 |
+
which can opportunistically send a notification through the datachannel
|
| 107 |
+
to release the reference being held for that object on the server.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
id: The id of the reference to release on the server side.
|
| 111 |
+
"""
|
| 112 |
+
return self.worker.call_release(id)
|
| 113 |
+
|
| 114 |
+
def call_retain(self, id: bytes) -> None:
|
| 115 |
+
"""Attempts to retain a client object reference.
|
| 116 |
+
|
| 117 |
+
Increments the reference count on the client side, to prevent
|
| 118 |
+
the client worker from attempting to release the server reference.
|
| 119 |
+
|
| 120 |
+
Args:
|
| 121 |
+
id: The id of the reference to retain on the client side.
|
| 122 |
+
"""
|
| 123 |
+
return self.worker.call_retain(id)
|
| 124 |
+
|
| 125 |
+
def close(self) -> None:
|
| 126 |
+
"""close cleans up an API connection by closing any channels or
|
| 127 |
+
shutting down any servers gracefully.
|
| 128 |
+
"""
|
| 129 |
+
return self.worker.close()
|
| 130 |
+
|
| 131 |
+
def get_actor(
|
| 132 |
+
self, name: str, namespace: Optional[str] = None
|
| 133 |
+
) -> "ClientActorHandle":
|
| 134 |
+
"""Returns a handle to an actor by name.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
name: The name passed to this actor by
|
| 138 |
+
Actor.options(name="name").remote()
|
| 139 |
+
"""
|
| 140 |
+
return self.worker.get_actor(name, namespace)
|
| 141 |
+
|
| 142 |
+
def list_named_actors(self, all_namespaces: bool = False) -> List[str]:
|
| 143 |
+
"""List all named actors in the system.
|
| 144 |
+
|
| 145 |
+
Actors must have been created with Actor.options(name="name").remote().
|
| 146 |
+
This works for both detached & non-detached actors.
|
| 147 |
+
|
| 148 |
+
By default, only actors in the current namespace will be returned
|
| 149 |
+
and the returned entries will simply be their name.
|
| 150 |
+
|
| 151 |
+
If `all_namespaces` is set to True, all actors in the cluster will be
|
| 152 |
+
returned regardless of namespace, and the returned entries will be of
|
| 153 |
+
the form '<namespace>/<name>'.
|
| 154 |
+
"""
|
| 155 |
+
return self.worker.list_named_actors(all_namespaces)
|
| 156 |
+
|
| 157 |
+
def kill(self, actor: "ClientActorHandle", *, no_restart=True):
|
| 158 |
+
"""kill forcibly stops an actor running in the cluster
|
| 159 |
+
|
| 160 |
+
Args:
|
| 161 |
+
no_restart: Whether this actor should be restarted if it's a
|
| 162 |
+
restartable actor.
|
| 163 |
+
"""
|
| 164 |
+
return self.worker.terminate_actor(actor, no_restart)
|
| 165 |
+
|
| 166 |
+
def cancel(self, obj: "ClientObjectRef", *, force=False, recursive=True):
|
| 167 |
+
"""Cancels a task on the cluster.
|
| 168 |
+
|
| 169 |
+
If the specified task is pending execution, it will not be executed. If
|
| 170 |
+
the task is currently executing, the behavior depends on the ``force``
|
| 171 |
+
flag, as per `ray.cancel()`
|
| 172 |
+
|
| 173 |
+
Only non-actor tasks can be canceled. Canceled tasks will not be
|
| 174 |
+
retried (max_retries will not be respected).
|
| 175 |
+
|
| 176 |
+
Args:
|
| 177 |
+
object_ref: ObjectRef returned by the task
|
| 178 |
+
that should be canceled.
|
| 179 |
+
force: Whether to force-kill a running task by killing
|
| 180 |
+
the worker that is running the task.
|
| 181 |
+
recursive: Whether to try to cancel tasks submitted by
|
| 182 |
+
the task specified.
|
| 183 |
+
"""
|
| 184 |
+
return self.worker.terminate_task(obj, force, recursive)
|
| 185 |
+
|
| 186 |
+
# Various metadata methods for the client that are defined in the protocol.
|
| 187 |
+
def is_initialized(self) -> bool:
|
| 188 |
+
"""True if our client is connected, and if the server is initialized.
|
| 189 |
+
Returns:
|
| 190 |
+
A boolean determining if the client is connected and
|
| 191 |
+
server initialized.
|
| 192 |
+
"""
|
| 193 |
+
return self.worker.is_initialized()
|
| 194 |
+
|
| 195 |
+
def nodes(self):
|
| 196 |
+
"""Get a list of the nodes in the cluster (for debugging only).
|
| 197 |
+
|
| 198 |
+
Returns:
|
| 199 |
+
Information about the Ray clients in the cluster.
|
| 200 |
+
"""
|
| 201 |
+
# Imported here rather than at module level; otherwise it breaks the doc build.
|
| 202 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 203 |
+
|
| 204 |
+
return self.worker.get_cluster_info(ray_client_pb2.ClusterInfoType.NODES)
|
| 205 |
+
|
| 206 |
+
def method(self, *args, **kwargs):
|
| 207 |
+
"""Annotate an actor method
|
| 208 |
+
|
| 209 |
+
Args:
|
| 210 |
+
num_returns: The number of object refs that should be returned by
|
| 211 |
+
invocations of this actor method.
|
| 212 |
+
"""
|
| 213 |
+
|
| 214 |
+
# NOTE: So this follows the same logic as in ray/actor.py::method()
|
| 215 |
+
# The reason to duplicate it here is to simplify the client mode
|
| 216 |
+
# redirection logic. As the annotated method gets pickled and sent to
|
| 217 |
+
# the server from the client it carries this private variable, it
|
| 218 |
+
# activates the same logic on the server side; so there's no need to
|
| 219 |
+
# pass anything else. It's inside the class definition that becomes an
|
| 220 |
+
# actor. Similar annotations would follow the same way.
|
| 221 |
+
valid_kwargs = ["num_returns", "concurrency_group"]
|
| 222 |
+
error_string = (
|
| 223 |
+
"The @ray.method decorator must be applied using at least one of "
|
| 224 |
+
f"the arguments in the list {valid_kwargs}, for example "
|
| 225 |
+
"'@ray.method(num_returns=2)'."
|
| 226 |
+
)
|
| 227 |
+
assert len(args) == 0 and len(kwargs) > 0, error_string
|
| 228 |
+
for key in kwargs:
|
| 229 |
+
key_error_string = (
|
| 230 |
+
f'Unexpected keyword argument to @ray.method: "{key}". The '
|
| 231 |
+
f"supported keyword arguments are {valid_kwargs}"
|
| 232 |
+
)
|
| 233 |
+
assert key in valid_kwargs, key_error_string
|
| 234 |
+
|
| 235 |
+
def annotate_method(method):
|
| 236 |
+
if "num_returns" in kwargs:
|
| 237 |
+
method.__ray_num_returns__ = kwargs["num_returns"]
|
| 238 |
+
if "concurrency_group" in kwargs:
|
| 239 |
+
method.__ray_concurrency_group__ = kwargs["concurrency_group"]
|
| 240 |
+
return method
|
| 241 |
+
|
| 242 |
+
return annotate_method
|
| 243 |
+
|
| 244 |
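A brief hedged sketch of what the annotation above leaves behind (illustrative only; `Counter` is a made-up class):

    class Counter:
        @ray.method(num_returns=2)
        def split(self, pair):
            return pair[0], pair[1]

    # annotate_method tags the function in place:
    assert Counter.split.__ray_num_returns__ == 2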
+
def cluster_resources(self):
|
| 245 |
+
"""Get the current total cluster resources.
|
| 246 |
+
|
| 247 |
+
Note that this information can grow stale as nodes are added to or
|
| 248 |
+
removed from the cluster.
|
| 249 |
+
|
| 250 |
+
Returns:
|
| 251 |
+
A dictionary mapping resource name to the total quantity of that
|
| 252 |
+
resource in the cluster.
|
| 253 |
+
"""
|
| 254 |
+
# Imported here rather than at module level; otherwise it breaks the doc build.
|
| 255 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 256 |
+
|
| 257 |
+
return self.worker.get_cluster_info(
|
| 258 |
+
ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
def available_resources(self):
|
| 262 |
+
"""Get the current available cluster resources.
|
| 263 |
+
|
| 264 |
+
This is different from `cluster_resources` in that this will return
|
| 265 |
+
idle (available) resources rather than total resources.
|
| 266 |
+
|
| 267 |
+
Note that this information can grow stale as tasks start and finish.
|
| 268 |
+
|
| 269 |
+
Returns:
|
| 270 |
+
A dictionary mapping resource name to the total quantity of that
|
| 271 |
+
resource in the cluster.
|
| 272 |
+
"""
|
| 273 |
+
# Imported here rather than at module level; otherwise it breaks the doc build.
|
| 274 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 275 |
+
|
| 276 |
+
return self.worker.get_cluster_info(
|
| 277 |
+
ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
def get_runtime_context(self):
|
| 281 |
+
"""Return a Ray RuntimeContext describing the state on the server
|
| 282 |
+
|
| 283 |
+
Returns:
|
| 284 |
+
A RuntimeContext wrapping a client making get_cluster_info calls.
|
| 285 |
+
"""
|
| 286 |
+
return _ClientWorkerPropertyAPI(self.worker).build_runtime_context()
|
| 287 |
+
|
| 288 |
+
# Client process isn't assigned any GPUs.
|
| 289 |
+
def get_gpu_ids(self) -> list:
|
| 290 |
+
return []
|
| 291 |
+
|
| 292 |
+
def timeline(self, filename: Optional[str] = None) -> Optional[List[Any]]:
|
| 293 |
+
logger.warning(
|
| 294 |
+
"Timeline will include events from other clients using this server."
|
| 295 |
+
)
|
| 296 |
+
# Imported here rather than at module level; otherwise it breaks the doc build.
|
| 297 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 298 |
+
|
| 299 |
+
all_events = self.worker.get_cluster_info(
|
| 300 |
+
ray_client_pb2.ClusterInfoType.TIMELINE
|
| 301 |
+
)
|
| 302 |
+
if filename is not None:
|
| 303 |
+
with open(filename, "w") as outfile:
|
| 304 |
+
json.dump(all_events, outfile)
|
| 305 |
+
else:
|
| 306 |
+
return all_events
|
| 307 |
+
|
| 308 |
+
def _internal_kv_initialized(self) -> bool:
|
| 309 |
+
"""Hook for internal_kv._internal_kv_initialized."""
|
| 310 |
+
# NOTE(edoakes): the kv is always initialized because we initialize it
|
| 311 |
+
# manually in the proxier with a GCS client if Ray hasn't been
|
| 312 |
+
# initialized yet.
|
| 313 |
+
return True
|
| 314 |
+
|
| 315 |
+
def _internal_kv_exists(
|
| 316 |
+
self, key: Union[str, bytes], *, namespace: Optional[Union[str, bytes]] = None
|
| 317 |
+
) -> bool:
|
| 318 |
+
"""Hook for internal_kv._internal_kv_exists."""
|
| 319 |
+
return self.worker.internal_kv_exists(
|
| 320 |
+
_as_bytes(key), namespace=_as_bytes(namespace)
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
def _internal_kv_get(
|
| 324 |
+
self, key: Union[str, bytes], *, namespace: Optional[Union[str, bytes]] = None
|
| 325 |
+
) -> bytes:
|
| 326 |
+
"""Hook for internal_kv._internal_kv_get."""
|
| 327 |
+
return self.worker.internal_kv_get(
|
| 328 |
+
_as_bytes(key), namespace=_as_bytes(namespace)
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
def _internal_kv_put(
|
| 332 |
+
self,
|
| 333 |
+
key: Union[str, bytes],
|
| 334 |
+
value: Union[str, bytes],
|
| 335 |
+
overwrite: bool = True,
|
| 336 |
+
*,
|
| 337 |
+
namespace: Optional[Union[str, bytes]] = None,
|
| 338 |
+
) -> bool:
|
| 339 |
+
"""Hook for internal_kv._internal_kv_put."""
|
| 340 |
+
return self.worker.internal_kv_put(
|
| 341 |
+
_as_bytes(key), _as_bytes(value), overwrite, namespace=_as_bytes(namespace)
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
def _internal_kv_del(
|
| 345 |
+
self,
|
| 346 |
+
key: Union[str, bytes],
|
| 347 |
+
*,
|
| 348 |
+
del_by_prefix: bool = False,
|
| 349 |
+
namespace: Optional[Union[str, bytes]] = None,
|
| 350 |
+
) -> int:
|
| 351 |
+
"""Hook for internal_kv._internal_kv_del."""
|
| 352 |
+
return self.worker.internal_kv_del(
|
| 353 |
+
_as_bytes(key), del_by_prefix=del_by_prefix, namespace=_as_bytes(namespace)
|
| 354 |
+
)
|
| 355 |
+
|
| 356 |
+
def _internal_kv_list(
|
| 357 |
+
self,
|
| 358 |
+
prefix: Union[str, bytes],
|
| 359 |
+
*,
|
| 360 |
+
namespace: Optional[Union[str, bytes]] = None,
|
| 361 |
+
) -> List[bytes]:
|
| 362 |
+
"""Hook for internal_kv._internal_kv_list."""
|
| 363 |
+
return self.worker.internal_kv_list(
|
| 364 |
+
_as_bytes(prefix), namespace=_as_bytes(namespace)
|
| 365 |
+
)
|
| 366 |
+
|
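A hedged usage sketch of these KV hooks (assumes `worker` is an already-connected client worker; the key, value, and namespace below are made up):

    api = _ClientAPI(worker)
    api._internal_kv_put("job:42", "running", namespace="demo")
    assert api._internal_kv_exists("job:42", namespace="demo")
    print(api._internal_kv_get("job:42", namespace="demo"))    # expected b"running"
    api._internal_kv_del("job:42", namespace="demo")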
| 367 |
+
def _pin_runtime_env_uri(self, uri: str, expiration_s: int) -> None:
|
| 368 |
+
"""Hook for internal_kv._pin_runtime_env_uri."""
|
| 369 |
+
return self.worker.pin_runtime_env_uri(uri, expiration_s)
|
| 370 |
+
|
| 371 |
+
def _convert_actor(self, actor: "ActorClass") -> str:
|
| 372 |
+
"""Register a ClientActorClass for the ActorClass and return a UUID"""
|
| 373 |
+
return self.worker._convert_actor(actor)
|
| 374 |
+
|
| 375 |
+
def _convert_function(self, func: "RemoteFunction") -> str:
|
| 376 |
+
"""Register a ClientRemoteFunc for the ActorClass and return a UUID"""
|
| 377 |
+
return self.worker._convert_function(func)
|
| 378 |
+
|
| 379 |
+
def _get_converted(self, key: str) -> "ClientStub":
|
| 380 |
+
"""Given a UUID, return the converted object"""
|
| 381 |
+
return self.worker._get_converted(key)
|
| 382 |
+
|
| 383 |
+
def _converted_key_exists(self, key: str) -> bool:
|
| 384 |
+
"""Check if a key UUID is present in the store of converted objects."""
|
| 385 |
+
return self.worker._converted_key_exists(key)
|
| 386 |
+
|
| 387 |
+
def __getattr__(self, key: str):
|
| 388 |
+
if not key.startswith("_"):
|
| 389 |
+
raise NotImplementedError(
|
| 390 |
+
"Not available in Ray client: `ray.{}`. This method is only "
|
| 391 |
+
"available within Ray remote functions and is not yet "
|
| 392 |
+
"implemented in the client API.".format(key)
|
| 393 |
+
)
|
| 394 |
+
return self.__getattribute__(key)
|
| 395 |
+
|
| 396 |
+
def _register_callback(
|
| 397 |
+
self, ref: "ClientObjectRef", callback: Callable[["DataResponse"], None]
|
| 398 |
+
) -> None:
|
| 399 |
+
self.worker.register_callback(ref, callback)
|
| 400 |
+
|
| 401 |
+
def _get_dashboard_url(self) -> str:
|
| 402 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 403 |
+
|
| 404 |
+
return self.worker.get_cluster_info(
|
| 405 |
+
ray_client_pb2.ClusterInfoType.DASHBOARD_URL
|
| 406 |
+
).get("dashboard_url", "")
|
.venv/lib/python3.11/site-packages/ray/util/client/client_app.py
ADDED
|
@@ -0,0 +1,90 @@
| 1 |
+
from ray.util.client import ray
|
| 2 |
+
from typing import Tuple
|
| 3 |
+
|
| 4 |
+
ray.connect("localhost:50051")
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@ray.remote
|
| 8 |
+
class HelloActor:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
self.count = 0
|
| 11 |
+
|
| 12 |
+
def say_hello(self, whom: str) -> Tuple[str, int]:
|
| 13 |
+
self.count += 1
|
| 14 |
+
return ("Hello " + whom, self.count)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
actor = HelloActor.remote()
|
| 18 |
+
s, count = ray.get(actor.say_hello.remote("you"))
|
| 19 |
+
print(s, count)
|
| 20 |
+
assert s == "Hello you"
|
| 21 |
+
assert count == 1
|
| 22 |
+
s, count = ray.get(actor.say_hello.remote("world"))
|
| 23 |
+
print(s, count)
|
| 24 |
+
assert s == "Hello world"
|
| 25 |
+
assert count == 2
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@ray.remote
|
| 29 |
+
def plus2(x):
|
| 30 |
+
return x + 2
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@ray.remote
|
| 34 |
+
def fact(x):
|
| 35 |
+
print(x, type(fact))
|
| 36 |
+
if x <= 0:
|
| 37 |
+
return 1
|
| 38 |
+
# This hits the "nested tasks" issue
|
| 39 |
+
# https://github.com/ray-project/ray/issues/3644
|
| 40 |
+
# So we're on the right track!
|
| 41 |
+
return ray.get(fact.remote(x - 1)) * x
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@ray.remote
|
| 45 |
+
def get_nodes():
|
| 46 |
+
return ray.nodes() # Can access the full Ray API in remote methods.
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
print("Cluster nodes", ray.get(get_nodes.remote()))
|
| 50 |
+
print(ray.nodes())
|
| 51 |
+
|
| 52 |
+
objectref = ray.put("hello world")
|
| 53 |
+
|
| 54 |
+
# `ClientObjectRef(...)`
|
| 55 |
+
print(objectref)
|
| 56 |
+
|
| 57 |
+
# `hello world`
|
| 58 |
+
print(ray.get(objectref))
|
| 59 |
+
|
| 60 |
+
ref2 = plus2.remote(234)
|
| 61 |
+
# `ClientObjectRef(...)`
|
| 62 |
+
print(ref2)
|
| 63 |
+
# `236`
|
| 64 |
+
print(ray.get(ref2))
|
| 65 |
+
|
| 66 |
+
ref3 = fact.remote(20)
|
| 67 |
+
# `ClientObjectRef(...)`
|
| 68 |
+
print(ref3)
|
| 69 |
+
# `2432902008176640000`
|
| 70 |
+
print(ray.get(ref3))
|
| 71 |
+
|
| 72 |
+
# Reuse the cached ClientRemoteFunc object
|
| 73 |
+
ref4 = fact.remote(5)
|
| 74 |
+
# `120`
|
| 75 |
+
print(ray.get(ref4))
|
| 76 |
+
|
| 77 |
+
ref5 = fact.remote(10)
|
| 78 |
+
|
| 79 |
+
print([ref2, ref3, ref4, ref5])
|
| 80 |
+
# should return ref2, ref3, ref4
|
| 81 |
+
res = ray.wait([ref5, ref2, ref3, ref4], num_returns=3)
|
| 82 |
+
print(res)
|
| 83 |
+
assert [ref2, ref3, ref4] == res[0]
|
| 84 |
+
assert [ref5] == res[1]
|
| 85 |
+
|
| 86 |
+
# should return ref2, ref3, ref4, ref5
|
| 87 |
+
res = ray.wait([ref2, ref3, ref4, ref5], num_returns=4)
|
| 88 |
+
print(res)
|
| 89 |
+
assert [ref2, ref3, ref4, ref5] == res[0]
|
| 90 |
+
assert [] == res[1]
|
.venv/lib/python3.11/site-packages/ray/util/client/common.py
ADDED
|
@@ -0,0 +1,956 @@
| 1 |
+
import inspect
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import pickle
|
| 5 |
+
import threading
|
| 6 |
+
import uuid
|
| 7 |
+
from collections import OrderedDict
|
| 8 |
+
from concurrent.futures import Future
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
| 11 |
+
|
| 12 |
+
import grpc
|
| 13 |
+
|
| 14 |
+
import ray._raylet as raylet
|
| 15 |
+
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
| 16 |
+
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
|
| 17 |
+
from ray._private import ray_constants
|
| 18 |
+
from ray._private.inspect_util import (
|
| 19 |
+
is_class_method,
|
| 20 |
+
is_cython,
|
| 21 |
+
is_function_or_method,
|
| 22 |
+
is_static_method,
|
| 23 |
+
)
|
| 24 |
+
from ray._private.signature import extract_signature, get_signature
|
| 25 |
+
from ray._private.utils import check_oversized_function
|
| 26 |
+
from ray.util.client import ray
|
| 27 |
+
from ray.util.client.options import validate_options
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
# The maximum field value for int32 ids -- which is also the maximum
|
| 32 |
+
# number of simultaneous in-flight requests.
|
| 33 |
+
INT32_MAX = (2**31) - 1
|
| 34 |
+
|
| 35 |
+
# gRPC status codes that the client shouldn't attempt to recover from
|
| 36 |
+
# Resource exhausted: Server is low on resources, or has hit the max number
|
| 37 |
+
# of client connections
|
| 38 |
+
# Invalid argument: Reserved for application errors
|
| 39 |
+
# Not found: Set if the client is attempting to reconnect to a session that
|
| 40 |
+
# does not exist
|
| 41 |
+
# Failed precondition: Reserved for application errors
|
| 42 |
+
# Aborted: Set when an error is serialized into the details of the context,
|
| 43 |
+
# signals that error should be deserialized on the client side
|
| 44 |
+
GRPC_UNRECOVERABLE_ERRORS = (
|
| 45 |
+
grpc.StatusCode.RESOURCE_EXHAUSTED,
|
| 46 |
+
grpc.StatusCode.INVALID_ARGUMENT,
|
| 47 |
+
grpc.StatusCode.NOT_FOUND,
|
| 48 |
+
grpc.StatusCode.FAILED_PRECONDITION,
|
| 49 |
+
grpc.StatusCode.ABORTED,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# TODO: Instead of just making the max message size large, the right thing to
|
| 53 |
+
# do is to split up the bytes representation of serialized data into multiple
|
| 54 |
+
# messages and reconstruct them on either end. That said, since clients are
|
| 55 |
+
# drivers and really just feed initial things in and final results out, (when
|
| 56 |
+
# not going to S3 or similar) then a large limit will suffice for many use
|
| 57 |
+
# cases.
|
| 58 |
+
#
|
| 59 |
+
# Currently, this is 2GiB, the max for a signed int.
|
| 60 |
+
GRPC_MAX_MESSAGE_SIZE = (2 * 1024 * 1024 * 1024) - 1
|
| 61 |
+
|
| 62 |
+
# 30 seconds because ELB timeout is 60 seconds
|
| 63 |
+
GRPC_KEEPALIVE_TIME_MS = 1000 * 30
|
| 64 |
+
|
| 65 |
+
# Long timeout because we do not want gRPC ending a connection.
|
| 66 |
+
GRPC_KEEPALIVE_TIMEOUT_MS = 1000 * 600
|
| 67 |
+
|
| 68 |
+
GRPC_OPTIONS = [
|
| 69 |
+
*ray_constants.GLOBAL_GRPC_OPTIONS,
|
| 70 |
+
("grpc.max_send_message_length", GRPC_MAX_MESSAGE_SIZE),
|
| 71 |
+
("grpc.max_receive_message_length", GRPC_MAX_MESSAGE_SIZE),
|
| 72 |
+
("grpc.keepalive_time_ms", GRPC_KEEPALIVE_TIME_MS),
|
| 73 |
+
("grpc.keepalive_timeout_ms", GRPC_KEEPALIVE_TIMEOUT_MS),
|
| 74 |
+
("grpc.keepalive_permit_without_calls", 1),
|
| 75 |
+
# Send an infinite number of pings
|
| 76 |
+
("grpc.http2.max_pings_without_data", 0),
|
| 77 |
+
("grpc.http2.min_ping_interval_without_data_ms", GRPC_KEEPALIVE_TIME_MS - 50),
|
| 78 |
+
# Allow many strikes
|
| 79 |
+
("grpc.http2.max_ping_strikes", 0),
|
| 80 |
+
]
|
| 81 |
+
|
| 82 |
+
CLIENT_SERVER_MAX_THREADS = float(os.getenv("RAY_CLIENT_SERVER_MAX_THREADS", 100))
|
| 83 |
+
|
| 84 |
+
# Large objects are chunked into 5 MiB messages, ref PR #35025
|
| 85 |
+
OBJECT_TRANSFER_CHUNK_SIZE = 5 * 2**20
|
| 86 |
+
|
| 87 |
+
# Warn the user if the object being transferred is larger than 2 GiB
|
| 88 |
+
OBJECT_TRANSFER_WARNING_SIZE = 2 * 2**30
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class ClientObjectRef(raylet.ObjectRef):
|
| 92 |
+
def __init__(self, id: Union[bytes, Future]):
|
| 93 |
+
self._mutex = threading.Lock()
|
| 94 |
+
self._worker = ray.get_context().client_worker
|
| 95 |
+
self._id_future = None
|
| 96 |
+
if isinstance(id, bytes):
|
| 97 |
+
self._set_id(id)
|
| 98 |
+
elif isinstance(id, Future):
|
| 99 |
+
self._id_future = id
|
| 100 |
+
else:
|
| 101 |
+
raise TypeError("Unexpected type for id {}".format(id))
|
| 102 |
+
|
| 103 |
+
def __del__(self):
|
| 104 |
+
if self._worker is not None and self._worker.is_connected():
|
| 105 |
+
try:
|
| 106 |
+
if not self.is_nil():
|
| 107 |
+
self._worker.call_release(self.id)
|
| 108 |
+
except Exception:
|
| 109 |
+
logger.info(
|
| 110 |
+
"Exception in ObjectRef is ignored in destructor. "
|
| 111 |
+
"To receive this exception in application code, call "
|
| 112 |
+
"a method on the actor reference before its destructor "
|
| 113 |
+
"is run."
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
def binary(self):
|
| 117 |
+
self._wait_for_id()
|
| 118 |
+
return super().binary()
|
| 119 |
+
|
| 120 |
+
def hex(self):
|
| 121 |
+
self._wait_for_id()
|
| 122 |
+
return super().hex()
|
| 123 |
+
|
| 124 |
+
def is_nil(self):
|
| 125 |
+
self._wait_for_id()
|
| 126 |
+
return super().is_nil()
|
| 127 |
+
|
| 128 |
+
def __hash__(self):
|
| 129 |
+
self._wait_for_id()
|
| 130 |
+
return hash(self.id)
|
| 131 |
+
|
| 132 |
+
def task_id(self):
|
| 133 |
+
self._wait_for_id()
|
| 134 |
+
return super().task_id()
|
| 135 |
+
|
| 136 |
+
@property
|
| 137 |
+
def id(self):
|
| 138 |
+
return self.binary()
|
| 139 |
+
|
| 140 |
+
def future(self) -> Future:
|
| 141 |
+
fut = Future()
|
| 142 |
+
|
| 143 |
+
def set_future(data: Any) -> None:
|
| 144 |
+
"""Schedules a callback to set the exception or result
|
| 145 |
+
in the Future."""
|
| 146 |
+
|
| 147 |
+
if isinstance(data, Exception):
|
| 148 |
+
fut.set_exception(data)
|
| 149 |
+
else:
|
| 150 |
+
fut.set_result(data)
|
| 151 |
+
|
| 152 |
+
self._on_completed(set_future)
|
| 153 |
+
|
| 154 |
+
# Prevent this object ref from being released.
|
| 155 |
+
fut.object_ref = self
|
| 156 |
+
return fut
|
| 157 |
+
|
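A hedged usage sketch (assumes a connected client and an existing ClientObjectRef named `ref`): `future()` bridges the ref into the standard concurrent.futures machinery.

    from concurrent.futures import wait

    fut = ref.future()                 # wires set_future in via _on_completed
    done, _not_done = wait([fut], timeout=30)
    if done:
        print(fut.result())            # raises here if the task failed, per set_future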
| 158 |
+
def _on_completed(self, py_callback: Callable[[Any], None]) -> None:
|
| 159 |
+
"""Register a callback that will be called after Object is ready.
|
| 160 |
+
If the ObjectRef is already ready, the callback will be called soon.
|
| 161 |
+
The callback should take the result as the only argument. The result
|
| 162 |
+
can be an exception object in case of task error.
|
| 163 |
+
"""
|
| 164 |
+
|
| 165 |
+
def deserialize_obj(
|
| 166 |
+
resp: Union[ray_client_pb2.DataResponse, Exception]
|
| 167 |
+
) -> None:
|
| 168 |
+
from ray.util.client.client_pickler import loads_from_server
|
| 169 |
+
|
| 170 |
+
if isinstance(resp, Exception):
|
| 171 |
+
data = resp
|
| 172 |
+
elif isinstance(resp, bytearray):
|
| 173 |
+
data = loads_from_server(resp)
|
| 174 |
+
else:
|
| 175 |
+
obj = resp.get
|
| 176 |
+
data = None
|
| 177 |
+
if not obj.valid:
|
| 178 |
+
data = loads_from_server(resp.get.error)
|
| 179 |
+
else:
|
| 180 |
+
data = loads_from_server(resp.get.data)
|
| 181 |
+
|
| 182 |
+
py_callback(data)
|
| 183 |
+
|
| 184 |
+
self._worker.register_callback(self, deserialize_obj)
|
| 185 |
+
|
| 186 |
+
def _set_id(self, id):
|
| 187 |
+
super()._set_id(id)
|
| 188 |
+
self._worker.call_retain(id)
|
| 189 |
+
|
| 190 |
+
def _wait_for_id(self, timeout=None):
|
| 191 |
+
if self._id_future:
|
| 192 |
+
with self._mutex:
|
| 193 |
+
if self._id_future:
|
| 194 |
+
self._set_id(self._id_future.result(timeout=timeout))
|
| 195 |
+
self._id_future = None
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
class ClientActorRef(raylet.ActorID):
|
| 199 |
+
def __init__(
|
| 200 |
+
self,
|
| 201 |
+
id: Union[bytes, Future],
|
| 202 |
+
weak_ref: Optional[bool] = False,
|
| 203 |
+
):
|
| 204 |
+
self._weak_ref = weak_ref
|
| 205 |
+
self._mutex = threading.Lock()
|
| 206 |
+
self._worker = ray.get_context().client_worker
|
| 207 |
+
if isinstance(id, bytes):
|
| 208 |
+
self._set_id(id)
|
| 209 |
+
self._id_future = None
|
| 210 |
+
elif isinstance(id, Future):
|
| 211 |
+
self._id_future = id
|
| 212 |
+
else:
|
| 213 |
+
raise TypeError("Unexpected type for id {}".format(id))
|
| 214 |
+
|
| 215 |
+
def __del__(self):
|
| 216 |
+
if self._weak_ref:
|
| 217 |
+
return
|
| 218 |
+
|
| 219 |
+
if self._worker is not None and self._worker.is_connected():
|
| 220 |
+
try:
|
| 221 |
+
if not self.is_nil():
|
| 222 |
+
self._worker.call_release(self.id)
|
| 223 |
+
except Exception:
|
| 224 |
+
logger.debug(
|
| 225 |
+
"Exception from actor creation is ignored in destructor. "
|
| 226 |
+
"To receive this exception in application code, call "
|
| 227 |
+
"a method on the actor reference before its destructor "
|
| 228 |
+
"is run."
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
def binary(self):
|
| 232 |
+
self._wait_for_id()
|
| 233 |
+
return super().binary()
|
| 234 |
+
|
| 235 |
+
def hex(self):
|
| 236 |
+
self._wait_for_id()
|
| 237 |
+
return super().hex()
|
| 238 |
+
|
| 239 |
+
def is_nil(self):
|
| 240 |
+
self._wait_for_id()
|
| 241 |
+
return super().is_nil()
|
| 242 |
+
|
| 243 |
+
def __hash__(self):
|
| 244 |
+
self._wait_for_id()
|
| 245 |
+
return hash(self.id)
|
| 246 |
+
|
| 247 |
+
@property
|
| 248 |
+
def id(self):
|
| 249 |
+
return self.binary()
|
| 250 |
+
|
| 251 |
+
def _set_id(self, id):
|
| 252 |
+
super()._set_id(id)
|
| 253 |
+
self._worker.call_retain(id)
|
| 254 |
+
|
| 255 |
+
def _wait_for_id(self, timeout=None):
|
| 256 |
+
if self._id_future:
|
| 257 |
+
with self._mutex:
|
| 258 |
+
if self._id_future:
|
| 259 |
+
self._set_id(self._id_future.result(timeout=timeout))
|
| 260 |
+
self._id_future = None
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class ClientStub:
|
| 264 |
+
pass
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
class ClientRemoteFunc(ClientStub):
|
| 268 |
+
"""A stub created on the Ray Client to represent a remote
|
| 269 |
+
function that can be executed on the cluster.
|
| 270 |
+
|
| 271 |
+
This class is allowed to be passed around between remote functions.
|
| 272 |
+
|
| 273 |
+
Args:
|
| 274 |
+
_func: The actual function to execute remotely
|
| 275 |
+
_name: The original name of the function
|
| 276 |
+
_ref: The ClientObjectRef of the pickled code of the function, _func
|
| 277 |
+
"""
|
| 278 |
+
|
| 279 |
+
def __init__(self, f, options=None):
|
| 280 |
+
self._lock = threading.Lock()
|
| 281 |
+
self._func = f
|
| 282 |
+
self._name = f.__name__
|
| 283 |
+
self._signature = get_signature(f)
|
| 284 |
+
self._ref = None
|
| 285 |
+
self._client_side_ref = ClientSideRefID.generate_id()
|
| 286 |
+
self._options = validate_options(options)
|
| 287 |
+
|
| 288 |
+
def __call__(self, *args, **kwargs):
|
| 289 |
+
raise TypeError(
|
| 290 |
+
"Remote function cannot be called directly. "
|
| 291 |
+
f"Use {self._name}.remote method instead"
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
def remote(self, *args, **kwargs):
|
| 295 |
+
# Check if supplied parameters match the function signature. Same case
|
| 296 |
+
# at the other callsites.
|
| 297 |
+
self._signature.bind(*args, **kwargs)
|
| 298 |
+
return return_refs(ray.call_remote(self, *args, **kwargs))
|
| 299 |
+
|
| 300 |
+
def options(self, **kwargs):
|
| 301 |
+
return OptionWrapper(self, kwargs)
|
| 302 |
+
|
| 303 |
+
def _remote(self, args=None, kwargs=None, **option_args):
|
| 304 |
+
if args is None:
|
| 305 |
+
args = []
|
| 306 |
+
if kwargs is None:
|
| 307 |
+
kwargs = {}
|
| 308 |
+
return self.options(**option_args).remote(*args, **kwargs)
|
| 309 |
+
|
| 310 |
+
def __repr__(self):
|
| 311 |
+
return "ClientRemoteFunc(%s, %s)" % (self._name, self._ref)
|
| 312 |
+
|
| 313 |
+
def _ensure_ref(self):
|
| 314 |
+
with self._lock:
|
| 315 |
+
if self._ref is None:
|
| 316 |
+
# While calling ray.put() on our function, if
|
| 317 |
+
# our function is recursive, it will attempt to
|
| 318 |
+
# encode the ClientRemoteFunc -- itself -- and
|
| 319 |
+
# infinitely recurse on _ensure_ref.
|
| 320 |
+
#
|
| 321 |
+
# So we set the state of the reference to be an
|
| 322 |
+
# in-progress self reference value, which
|
| 323 |
+
# the encoding can detect and handle correctly.
|
| 324 |
+
self._ref = InProgressSentinel()
|
| 325 |
+
data = ray.worker._dumps_from_client(self._func)
|
| 326 |
+
# Check pickled size before sending it to server, which is more
|
| 327 |
+
# efficient and can be done synchronously inside remote() call.
|
| 328 |
+
check_oversized_function(data, self._name, "remote function", None)
|
| 329 |
+
self._ref = ray.worker._put_pickled(
|
| 330 |
+
data, client_ref_id=self._client_side_ref.id
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
| 334 |
+
self._ensure_ref()
|
| 335 |
+
task = ray_client_pb2.ClientTask()
|
| 336 |
+
task.type = ray_client_pb2.ClientTask.FUNCTION
|
| 337 |
+
task.name = self._name
|
| 338 |
+
task.payload_id = self._ref.id
|
| 339 |
+
set_task_options(task, self._options, "baseline_options")
|
| 340 |
+
return task
|
| 341 |
+
|
| 342 |
+
def _num_returns(self) -> int:
|
| 343 |
+
if not self._options:
|
| 344 |
+
return None
|
| 345 |
+
return self._options.get("num_returns")
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
class ClientActorClass(ClientStub):
|
| 349 |
+
"""A stub created on the Ray Client to represent an actor class.
|
| 350 |
+
|
| 351 |
+
It is wrapped by ray.remote and can be executed on the cluster.
|
| 352 |
+
|
| 353 |
+
Args:
|
| 354 |
+
actor_cls: The actual class to execute remotely
|
| 355 |
+
_name: The original name of the class
|
| 356 |
+
_ref: The ClientObjectRef of the pickled `actor_cls`
|
| 357 |
+
"""
|
| 358 |
+
|
| 359 |
+
def __init__(self, actor_cls, options=None):
|
| 360 |
+
self.actor_cls = actor_cls
|
| 361 |
+
self._lock = threading.Lock()
|
| 362 |
+
self._name = actor_cls.__name__
|
| 363 |
+
self._init_signature = inspect.Signature(
|
| 364 |
+
parameters=extract_signature(actor_cls.__init__, ignore_first=True)
|
| 365 |
+
)
|
| 366 |
+
self._ref = None
|
| 367 |
+
self._client_side_ref = ClientSideRefID.generate_id()
|
| 368 |
+
self._options = validate_options(options)
|
| 369 |
+
|
| 370 |
+
def __call__(self, *args, **kwargs):
|
| 371 |
+
raise TypeError(
|
| 372 |
+
"Remote actor cannot be instantiated directly. "
|
| 373 |
+
f"Use {self._name}.remote() instead"
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
def _ensure_ref(self):
|
| 377 |
+
with self._lock:
|
| 378 |
+
if self._ref is None:
|
| 379 |
+
# As before, set the state of the reference to be an
|
| 380 |
+
# in-progress self reference value, which
|
| 381 |
+
# the encoding can detect and handle correctly.
|
| 382 |
+
self._ref = InProgressSentinel()
|
| 383 |
+
data = ray.worker._dumps_from_client(self.actor_cls)
|
| 384 |
+
# Check pickled size before sending it to server, which is more
|
| 385 |
+
# efficient and can be done synchronously inside remote() call.
|
| 386 |
+
check_oversized_function(data, self._name, "actor", None)
|
| 387 |
+
self._ref = ray.worker._put_pickled(
|
| 388 |
+
data, client_ref_id=self._client_side_ref.id
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
def remote(self, *args, **kwargs) -> "ClientActorHandle":
|
| 392 |
+
self._init_signature.bind(*args, **kwargs)
|
| 393 |
+
# Actually instantiate the actor
|
| 394 |
+
futures = ray.call_remote(self, *args, **kwargs)
|
| 395 |
+
assert len(futures) == 1
|
| 396 |
+
return ClientActorHandle(ClientActorRef(futures[0]), actor_class=self)
|
| 397 |
+
|
| 398 |
+
def options(self, **kwargs):
|
| 399 |
+
return ActorOptionWrapper(self, kwargs)
|
| 400 |
+
|
| 401 |
+
def _remote(self, args=None, kwargs=None, **option_args):
|
| 402 |
+
if args is None:
|
| 403 |
+
args = []
|
| 404 |
+
if kwargs is None:
|
| 405 |
+
kwargs = {}
|
| 406 |
+
return self.options(**option_args).remote(*args, **kwargs)
|
| 407 |
+
|
| 408 |
+
def __repr__(self):
|
| 409 |
+
return "ClientActorClass(%s, %s)" % (self._name, self._ref)
|
| 410 |
+
|
| 411 |
+
def __getattr__(self, key):
|
| 412 |
+
if key not in self.__dict__:
|
| 413 |
+
raise AttributeError("Not a class attribute")
|
| 414 |
+
raise NotImplementedError("static methods")
|
| 415 |
+
|
| 416 |
+
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
| 417 |
+
self._ensure_ref()
|
| 418 |
+
task = ray_client_pb2.ClientTask()
|
| 419 |
+
task.type = ray_client_pb2.ClientTask.ACTOR
|
| 420 |
+
task.name = self._name
|
| 421 |
+
task.payload_id = self._ref.id
|
| 422 |
+
set_task_options(task, self._options, "baseline_options")
|
| 423 |
+
return task
|
| 424 |
+
|
| 425 |
+
@staticmethod
|
| 426 |
+
def _num_returns() -> int:
|
| 427 |
+
return 1
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
class ClientActorHandle(ClientStub):
|
| 431 |
+
"""Client-side stub for instantiated actor.
|
| 432 |
+
|
| 433 |
+
A stub created on the Ray Client to represent a remote actor that
|
| 434 |
+
has been started on the cluster. This class is allowed to be passed
|
| 435 |
+
around between remote functions.
|
| 436 |
+
|
| 437 |
+
Args:
|
| 438 |
+
actor_ref: A reference to the running actor given to the client. This
|
| 439 |
+
is a serialized version of the actual handle as an opaque token.
|
| 440 |
+
"""
|
| 441 |
+
|
| 442 |
+
def __init__(
|
| 443 |
+
self,
|
| 444 |
+
actor_ref: ClientActorRef,
|
| 445 |
+
actor_class: Optional[ClientActorClass] = None,
|
| 446 |
+
):
|
| 447 |
+
self.actor_ref = actor_ref
|
| 448 |
+
self._dir: Optional[List[str]] = None
|
| 449 |
+
if actor_class is not None:
|
| 450 |
+
self._method_num_returns = {}
|
| 451 |
+
self._method_signatures = {}
|
| 452 |
+
for method_name, method_obj in inspect.getmembers(
|
| 453 |
+
actor_class.actor_cls, is_function_or_method
|
| 454 |
+
):
|
| 455 |
+
self._method_num_returns[method_name] = getattr(
|
| 456 |
+
method_obj, "__ray_num_returns__", None
|
| 457 |
+
)
|
| 458 |
+
self._method_signatures[method_name] = inspect.Signature(
|
| 459 |
+
parameters=extract_signature(
|
| 460 |
+
method_obj,
|
| 461 |
+
ignore_first=(
|
| 462 |
+
not (
|
| 463 |
+
is_class_method(method_obj)
|
| 464 |
+
or is_static_method(actor_class.actor_cls, method_name)
|
| 465 |
+
)
|
| 466 |
+
),
|
| 467 |
+
)
|
| 468 |
+
)
|
| 469 |
+
else:
|
| 470 |
+
self._method_num_returns = None
|
| 471 |
+
self._method_signatures = None
|
| 472 |
+
|
| 473 |
+
def __dir__(self) -> List[str]:
|
| 474 |
+
if self._method_num_returns is not None:
|
| 475 |
+
return self._method_num_returns.keys()
|
| 476 |
+
if ray.is_connected():
|
| 477 |
+
self._init_class_info()
|
| 478 |
+
return self._method_num_returns.keys()
|
| 479 |
+
return super().__dir__()
|
| 480 |
+
|
| 481 |
+
# For compatibility with core worker ActorHandle._actor_id which returns
|
| 482 |
+
# ActorID
|
| 483 |
+
@property
|
| 484 |
+
def _actor_id(self) -> ClientActorRef:
|
| 485 |
+
return self.actor_ref
|
| 486 |
+
|
| 487 |
+
def __hash__(self) -> int:
|
| 488 |
+
return hash(self._actor_id)
|
| 489 |
+
|
| 490 |
+
def __eq__(self, __value) -> bool:
|
| 491 |
+
return hash(self) == hash(__value)
|
| 492 |
+
|
| 493 |
+
def __getattr__(self, key):
|
| 494 |
+
if key == "_method_num_returns":
|
| 495 |
+
# We need to explicitly handle this value since it is used below,
|
| 496 |
+
# otherwise we may end up infinitely recursing when deserializing.
|
| 497 |
+
# This can happen after unpickling an object but before
|
| 498 |
+
# _method_num_returns is correctly populated.
|
| 499 |
+
raise AttributeError(f"ClientActorRef has no attribute '{key}'")
|
| 500 |
+
|
| 501 |
+
if self._method_num_returns is None:
|
| 502 |
+
self._init_class_info()
|
| 503 |
+
if key not in self._method_signatures:
|
| 504 |
+
raise AttributeError(f"ClientActorRef has no attribute '{key}'")
|
| 505 |
+
return ClientRemoteMethod(
|
| 506 |
+
self,
|
| 507 |
+
key,
|
| 508 |
+
self._method_num_returns.get(key),
|
| 509 |
+
self._method_signatures.get(key),
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
def __repr__(self):
|
| 513 |
+
return "ClientActorHandle(%s)" % (self.actor_ref.id.hex())
|
| 514 |
+
|
| 515 |
+
def _init_class_info(self):
|
| 516 |
+
# TODO: fetch Ray method decorators
|
| 517 |
+
@ray.remote(num_cpus=0)
|
| 518 |
+
def get_class_info(x):
|
| 519 |
+
return x._ray_method_num_returns, x._ray_method_signatures
|
| 520 |
+
|
| 521 |
+
self._method_num_returns, method_parameters = ray.get(
|
| 522 |
+
get_class_info.remote(self)
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
self._method_signatures = {}
|
| 526 |
+
for method, parameters in method_parameters.items():
|
| 527 |
+
self._method_signatures[method] = inspect.Signature(parameters=parameters)
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
class ClientRemoteMethod(ClientStub):
|
| 531 |
+
"""A stub for a method on a remote actor.
|
| 532 |
+
|
| 533 |
+
Can be annotated with execution options.
|
| 534 |
+
|
| 535 |
+
Args:
|
| 536 |
+
actor_handle: A reference to the ClientActorHandle that generated
|
| 537 |
+
this method and will have this method called upon it.
|
| 538 |
+
method_name: The name of this method
|
| 539 |
+
"""
|
| 540 |
+
|
| 541 |
+
def __init__(
|
| 542 |
+
self,
|
| 543 |
+
actor_handle: ClientActorHandle,
|
| 544 |
+
method_name: str,
|
| 545 |
+
num_returns: int,
|
| 546 |
+
signature: inspect.Signature,
|
| 547 |
+
):
|
| 548 |
+
self._actor_handle = actor_handle
|
| 549 |
+
self._method_name = method_name
|
| 550 |
+
self._method_num_returns = num_returns
|
| 551 |
+
self._signature = signature
|
| 552 |
+
|
| 553 |
+
def __call__(self, *args, **kwargs):
|
| 554 |
+
raise TypeError(
|
| 555 |
+
"Actor methods cannot be called directly. Instead "
|
| 556 |
+
f"of running 'object.{self._method_name}()', try "
|
| 557 |
+
f"'object.{self._method_name}.remote()'."
|
| 558 |
+
)
|
| 559 |
+
|
| 560 |
+
def remote(self, *args, **kwargs):
|
| 561 |
+
self._signature.bind(*args, **kwargs)
|
| 562 |
+
return return_refs(ray.call_remote(self, *args, **kwargs))
|
| 563 |
+
|
| 564 |
+
def __repr__(self):
|
| 565 |
+
return "ClientRemoteMethod(%s, %s, %s)" % (
|
| 566 |
+
self._method_name,
|
| 567 |
+
self._actor_handle,
|
| 568 |
+
self._method_num_returns,
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
def options(self, **kwargs):
|
| 572 |
+
return OptionWrapper(self, kwargs)
|
| 573 |
+
|
| 574 |
+
def _remote(self, args=None, kwargs=None, **option_args):
|
| 575 |
+
if args is None:
|
| 576 |
+
args = []
|
| 577 |
+
if kwargs is None:
|
| 578 |
+
kwargs = {}
|
| 579 |
+
return self.options(**option_args).remote(*args, **kwargs)
|
| 580 |
+
|
| 581 |
+
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
| 582 |
+
task = ray_client_pb2.ClientTask()
|
| 583 |
+
task.type = ray_client_pb2.ClientTask.METHOD
|
| 584 |
+
task.name = self._method_name
|
| 585 |
+
task.payload_id = self._actor_handle.actor_ref.id
|
| 586 |
+
return task
|
| 587 |
+
|
| 588 |
+
def _num_returns(self) -> int:
|
| 589 |
+
return self._method_num_returns
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
class OptionWrapper:
|
| 593 |
+
def __init__(self, stub: ClientStub, options: Optional[Dict[str, Any]]):
|
| 594 |
+
self._remote_stub = stub
|
| 595 |
+
self._options = validate_options(options)
|
| 596 |
+
|
| 597 |
+
def remote(self, *args, **kwargs):
|
| 598 |
+
self._remote_stub._signature.bind(*args, **kwargs)
|
| 599 |
+
return return_refs(ray.call_remote(self, *args, **kwargs))
|
| 600 |
+
|
| 601 |
+
def __getattr__(self, key):
|
| 602 |
+
return getattr(self._remote_stub, key)
|
| 603 |
+
|
| 604 |
+
def _prepare_client_task(self):
|
| 605 |
+
task = self._remote_stub._prepare_client_task()
|
| 606 |
+
set_task_options(task, self._options)
|
| 607 |
+
return task
|
| 608 |
+
|
| 609 |
+
def _num_returns(self) -> int:
|
| 610 |
+
if self._options:
|
| 611 |
+
num = self._options.get("num_returns")
|
| 612 |
+
if num is not None:
|
| 613 |
+
return num
|
| 614 |
+
return self._remote_stub._num_returns()
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
class ActorOptionWrapper(OptionWrapper):
|
| 618 |
+
def remote(self, *args, **kwargs):
|
| 619 |
+
self._remote_stub._init_signature.bind(*args, **kwargs)
|
| 620 |
+
futures = ray.call_remote(self, *args, **kwargs)
|
| 621 |
+
assert len(futures) == 1
|
| 622 |
+
actor_class = None
|
| 623 |
+
if isinstance(self._remote_stub, ClientActorClass):
|
| 624 |
+
actor_class = self._remote_stub
|
| 625 |
+
return ClientActorHandle(ClientActorRef(futures[0]), actor_class=actor_class)
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
def set_task_options(
|
| 629 |
+
task: ray_client_pb2.ClientTask,
|
| 630 |
+
options: Optional[Dict[str, Any]],
|
| 631 |
+
field: str = "options",
|
| 632 |
+
) -> None:
|
| 633 |
+
if options is None:
|
| 634 |
+
task.ClearField(field)
|
| 635 |
+
return
|
| 636 |
+
|
| 637 |
+
getattr(task, field).pickled_options = pickle.dumps(options)
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def return_refs(
|
| 641 |
+
futures: List[Future],
|
| 642 |
+
) -> Union[None, ClientObjectRef, List[ClientObjectRef]]:
|
| 643 |
+
if not futures:
|
| 644 |
+
return None
|
| 645 |
+
if len(futures) == 1:
|
| 646 |
+
return ClientObjectRef(futures[0])
|
| 647 |
+
return [ClientObjectRef(fut) for fut in futures]
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
class InProgressSentinel:
|
| 651 |
+
def __repr__(self) -> str:
|
| 652 |
+
return self.__class__.__name__
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
class ClientSideRefID:
|
| 656 |
+
"""An ID generated by the client for objects not yet given an ObjectRef"""
|
| 657 |
+
|
| 658 |
+
def __init__(self, id: bytes):
|
| 659 |
+
assert len(id) != 0
|
| 660 |
+
self.id = id
|
| 661 |
+
|
| 662 |
+
@staticmethod
|
| 663 |
+
def generate_id() -> "ClientSideRefID":
|
| 664 |
+
tid = uuid.uuid4()
|
| 665 |
+
return ClientSideRefID(b"\xcc" + tid.bytes)
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
def remote_decorator(options: Optional[Dict[str, Any]]):
|
| 669 |
+
def decorator(function_or_class) -> ClientStub:
|
| 670 |
+
if inspect.isfunction(function_or_class) or is_cython(function_or_class):
|
| 671 |
+
return ClientRemoteFunc(function_or_class, options=options)
|
| 672 |
+
elif inspect.isclass(function_or_class):
|
| 673 |
+
return ClientActorClass(function_or_class, options=options)
|
| 674 |
+
else:
|
| 675 |
+
raise TypeError(
|
| 676 |
+
"The @ray.remote decorator must be applied to "
|
| 677 |
+
"either a function or to a class."
|
| 678 |
+
)
|
| 679 |
+
|
| 680 |
+
return decorator
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
@dataclass
|
| 684 |
+
class ClientServerHandle:
|
| 685 |
+
"""Holds the handles to the registered gRPC servicers and their server."""
|
| 686 |
+
|
| 687 |
+
task_servicer: ray_client_pb2_grpc.RayletDriverServicer
|
| 688 |
+
data_servicer: ray_client_pb2_grpc.RayletDataStreamerServicer
|
| 689 |
+
logs_servicer: ray_client_pb2_grpc.RayletLogStreamerServicer
|
| 690 |
+
grpc_server: grpc.Server
|
| 691 |
+
|
| 692 |
+
def stop(self, grace: int) -> None:
|
| 693 |
+
# The data servicer might be sleeping while waiting for clients to
|
| 694 |
+
# reconnect. Signal that they no longer have to sleep and can exit
|
| 695 |
+
# immediately, since the RPC server is stopped.
|
| 696 |
+
self.grpc_server.stop(grace)
|
| 697 |
+
self.data_servicer.stopped.set()
|
| 698 |
+
|
| 699 |
+
# Add a hook for all the cases that previously
|
| 700 |
+
# expected simply a gRPC server
|
| 701 |
+
def __getattr__(self, attr):
|
| 702 |
+
return getattr(self.grpc_server, attr)
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
def _get_client_id_from_context(context: Any) -> str:
|
| 706 |
+
"""
|
| 707 |
+
Get `client_id` from gRPC metadata. If the `client_id` is not present,
|
| 708 |
+
this function logs an error and sets the status_code.
|
| 709 |
+
"""
|
| 710 |
+
metadata = {k: v for k, v in context.invocation_metadata()}
|
| 711 |
+
client_id = metadata.get("client_id") or ""
|
| 712 |
+
if client_id == "":
|
| 713 |
+
logger.error("Client connecting with no client_id")
|
| 714 |
+
context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
|
| 715 |
+
return client_id
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
def _propagate_error_in_context(e: Exception, context: Any) -> bool:
|
| 719 |
+
"""
|
| 720 |
+
Encode an error into the context of an RPC response. Returns True
|
| 721 |
+
if the error can be recovered from, false otherwise
|
| 722 |
+
"""
|
| 723 |
+
try:
|
| 724 |
+
if isinstance(e, grpc.RpcError):
|
| 725 |
+
# RPC error, propagate directly by copying details into context
|
| 726 |
+
context.set_code(e.code())
|
| 727 |
+
context.set_details(e.details())
|
| 728 |
+
return e.code() not in GRPC_UNRECOVERABLE_ERRORS
|
| 729 |
+
except Exception:
|
| 730 |
+
# Extra precaution -- if encoding the RPC directly fails fallback
|
| 731 |
+
# to treating it as a regular error
|
| 732 |
+
pass
|
| 733 |
+
context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
|
| 734 |
+
context.set_details(str(e))
|
| 735 |
+
return False
|
| 736 |
+
|
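A hedged sketch of the intended call pattern inside a servicer method (the handler name and the empty reply are hypothetical stand-ins, not taken from this file):

    def SomeUnaryRpc(self, request, context):
        try:
            return self._real_handler(request)          # hypothetical real work
        except Exception as e:
            recoverable = _propagate_error_in_context(e, context)
            if not recoverable:
                logger.exception("Unrecoverable error in SomeUnaryRpc")
            # Status and details are already set on `context`; return an empty reply
            # (DataResponse used here only as a placeholder message type).
            return ray_client_pb2.DataResponse()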
| 737 |
+
|
| 738 |
+
def _id_is_newer(id1: int, id2: int) -> bool:
|
| 739 |
+
"""
|
| 740 |
+
We should only replace cache entries with the responses for newer IDs.
|
| 741 |
+
Most of the time newer IDs will be the ones with higher value, except when
|
| 742 |
+
the req_id counter rolls over. We check for this case by checking the
|
| 743 |
+
distance between the two IDs. If the distance is significant, then it's
|
| 744 |
+
likely that the req_id counter rolled over, and the smaller id should
|
| 745 |
+
still be used to replace the one in cache.
|
| 746 |
+
"""
|
| 747 |
+
diff = abs(id2 - id1)
|
| 748 |
+
if diff > (INT32_MAX // 2):
|
| 749 |
+
# Rollover likely occurred. In this case the smaller ID is newer
|
| 750 |
+
return id1 < id2
|
| 751 |
+
return id1 > id2
|
| 752 |
+
|
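A small worked example of the rollover check (a sketch with made-up ids, using the module-level INT32_MAX above):

    # Normal case: the larger value is newer.
    assert _id_is_newer(10, 9)

    # Rollover case: req_id wrapped from INT32_MAX back to 1, so the
    # numerically smaller id (1) is treated as newer than INT32_MAX.
    assert not _id_is_newer(INT32_MAX, 1)
    assert _id_is_newer(1, INT32_MAX)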
| 753 |
+
|
| 754 |
+
class ResponseCache:
|
| 755 |
+
"""
|
| 756 |
+
Cache for blocking method calls. Needed to prevent retried requests from
|
| 757 |
+
being applied multiple times on the server, for example when the client
|
| 758 |
+
disconnects. This is used to cache requests/responses sent through
|
| 759 |
+
unary-unary RPCs to the RayletServicer.
|
| 760 |
+
|
| 761 |
+
Note that no clean up logic is used, the last response for each thread
|
| 762 |
+
will always be remembered, so at most the cache will hold N entries,
|
| 763 |
+
where N is the number of threads on the client side. This relies on the
|
| 764 |
+
assumption that a thread will not make a new blocking request until it has
|
| 765 |
+
received a response for a previous one, at which point it's safe to
|
| 766 |
+
overwrite the old response.
|
| 767 |
+
|
| 768 |
+
The high level logic is:
|
| 769 |
+
|
| 770 |
+
1. Before making a call, check the cache for the current thread.
|
| 771 |
+
2. If present in the cache, check the request id of the cached
|
| 772 |
+
response.
|
| 773 |
+
a. If it matches the current request_id, then the request has been
|
| 774 |
+
received before and we shouldn't re-attempt the logic. Wait for
|
| 775 |
+
the response to become available in the cache, and then return it
|
| 776 |
+
b. If it doesn't match, then this is a new request and we can
|
| 777 |
+
proceed with calling the real stub. While the response is still
|
| 778 |
+
being generated, temporarily keep (req_id, None) in the cache.
|
| 779 |
+
Once the call is finished, update the cache entry with the
|
| 780 |
+
new (req_id, response) pair. Notify other threads that may
|
| 781 |
+
have been waiting for the response to be prepared.
|
| 782 |
+
"""
|
| 783 |
+
|
| 784 |
+
def __init__(self):
|
| 785 |
+
self.cv = threading.Condition()
|
| 786 |
+
self.cache: Dict[int, Tuple[int, Any]] = {}
|
| 787 |
+
|
| 788 |
+
def check_cache(self, thread_id: int, request_id: int) -> Optional[Any]:
|
| 789 |
+
"""
|
| 790 |
+
Check the cache for a given thread, and see if the entry in the cache
|
| 791 |
+
matches the current request_id. Returns None if the request_id has
|
| 792 |
+
not been seen yet, otherwise returns the cached result.
|
| 793 |
+
|
| 794 |
+
Throws an error if the placeholder in the cache doesn't match the
|
| 795 |
+
request_id -- this means that a new request evicted the old value in
|
| 796 |
+
the cache, and that the RPC for `request_id` is redundant and the
|
| 797 |
+
result can be discarded, i.e.:
|
| 798 |
+
|
| 799 |
+
1. Request A is sent (A1)
|
| 800 |
+
2. Channel disconnects
|
| 801 |
+
3. Request A is resent (A2)
|
| 802 |
+
4. A1 is received
|
| 803 |
+
5. A2 is received, waits for A1 to finish
|
| 804 |
+
6. A1 finishes and is sent back to client
|
| 805 |
+
7. Request B is sent
|
| 806 |
+
8. Request B overwrites cache entry
|
| 807 |
+
9. A2 wakes up extremely late, but cache is now invalid
|
| 808 |
+
|
| 809 |
+
In practice this is VERY unlikely to happen, but the error can at
|
| 810 |
+
least serve as a sanity check or catch invalid request ids.
|
| 811 |
+
"""
|
| 812 |
+
with self.cv:
|
| 813 |
+
if thread_id in self.cache:
|
| 814 |
+
cached_request_id, cached_resp = self.cache[thread_id]
|
| 815 |
+
if cached_request_id == request_id:
|
| 816 |
+
while cached_resp is None:
|
| 817 |
+
# The call was started, but the response hasn't yet
|
| 818 |
+
# been added to the cache. Let go of the lock and
|
| 819 |
+
# wait until the response is ready.
|
| 820 |
+
self.cv.wait()
|
| 821 |
+
cached_request_id, cached_resp = self.cache[thread_id]
|
| 822 |
+
if cached_request_id != request_id:
|
| 823 |
+
raise RuntimeError(
|
| 824 |
+
"Cached response doesn't match the id of the "
|
| 825 |
+
"original request. This might happen if this "
|
| 826 |
+
"request was received out of order. The "
|
| 827 |
+
"result of the caller is no longer needed. "
|
| 828 |
+
f"({request_id} != {cached_request_id})"
|
| 829 |
+
)
|
| 830 |
+
return cached_resp
|
| 831 |
+
if not _id_is_newer(request_id, cached_request_id):
|
| 832 |
+
raise RuntimeError(
|
| 833 |
+
"Attempting to replace newer cache entry with older "
|
| 834 |
+
"one. This might happen if this request was received "
|
| 835 |
+
"out of order. The result of the caller is no "
|
| 836 |
+
f"longer needed. ({request_id} != {cached_request_id}"
|
| 837 |
+
)
|
| 838 |
+
self.cache[thread_id] = (request_id, None)
|
| 839 |
+
return None
|
| 840 |
+
|
| 841 |
+
def update_cache(self, thread_id: int, request_id: int, response: Any) -> None:
|
| 842 |
+
"""
|
| 843 |
+
Inserts `response` into the cache for `request_id`.
|
| 844 |
+
"""
|
| 845 |
+
with self.cv:
|
| 846 |
+
cached_request_id, cached_resp = self.cache[thread_id]
|
| 847 |
+
if cached_request_id != request_id or cached_resp is not None:
|
| 848 |
+
# The cache was overwritten by a newer requester between
|
| 849 |
+
# our call to check_cache and our call to update it.
|
| 850 |
+
# This can't happen if the assumption that the cached requests
|
| 851 |
+
# are all blocking on the client side, so if you encounter
|
| 852 |
+
# this, check if any async requests are being cached.
|
| 853 |
+
raise RuntimeError(
|
| 854 |
+
"Attempting to update the cache, but placeholder's "
|
| 855 |
+
"do not match the current request_id. This might happen "
|
| 856 |
+
"if this request was received out of order. The result "
|
| 857 |
+
f"of the caller is no longer needed. ({request_id} != "
|
| 858 |
+
f"{cached_request_id})"
|
| 859 |
+
)
|
| 860 |
+
self.cache[thread_id] = (request_id, response)
|
| 861 |
+
self.cv.notify_all()
|
| 862 |
+
|
| 863 |
+
|
| 864 |
+
class OrderedResponseCache:
|
| 865 |
+
"""
|
| 866 |
+
Cache for streaming RPCs, i.e. the DataServicer. Relies on explicit
|
| 867 |
+
ack's from the client to determine when it can clean up cache entries.
|
| 868 |
+
"""
|
| 869 |
+
|
| 870 |
+
def __init__(self):
|
| 871 |
+
self.last_received = 0
|
| 872 |
+
self.cv = threading.Condition()
|
| 873 |
+
self.cache: Dict[int, Any] = OrderedDict()
|
| 874 |
+
|
| 875 |
+
def check_cache(self, req_id: int) -> Optional[Any]:
|
| 876 |
+
"""
|
| 877 |
+
Check the cache for a given thread, and see if the entry in the cache
|
| 878 |
+
matches the current request_id. Returns None if the request_id has
|
| 879 |
+
not been seen yet, otherwise returns the cached result.
|
| 880 |
+
"""
|
| 881 |
+
with self.cv:
|
| 882 |
+
if _id_is_newer(self.last_received, req_id) or self.last_received == req_id:
|
| 883 |
+
# Request is for an id that has already been cleared from
|
| 884 |
+
# cache/acknowledged.
|
| 885 |
+
raise RuntimeError(
|
| 886 |
+
"Attempting to accesss a cache entry that has already "
|
| 887 |
+
"cleaned up. The client has already acknowledged "
|
| 888 |
+
f"receiving this response. ({req_id}, "
|
| 889 |
+
f"{self.last_received})"
|
| 890 |
+
)
|
| 891 |
+
if req_id in self.cache:
|
| 892 |
+
cached_resp = self.cache[req_id]
|
| 893 |
+
while cached_resp is None:
|
| 894 |
+
# The call was started, but the response hasn't yet been
|
| 895 |
+
# added to the cache. Let go of the lock and wait until
|
| 896 |
+
# the response is ready
|
| 897 |
+
self.cv.wait()
|
| 898 |
+
if req_id not in self.cache:
|
| 899 |
+
raise RuntimeError(
|
| 900 |
+
"Cache entry was removed. This likely means that "
|
| 901 |
+
"the result of this call is no longer needed."
|
| 902 |
+
)
|
| 903 |
+
cached_resp = self.cache[req_id]
|
| 904 |
+
return cached_resp
|
| 905 |
+
self.cache[req_id] = None
|
| 906 |
+
return None
|
| 907 |
+
|
| 908 |
+
def update_cache(self, req_id: int, resp: Any) -> None:
|
| 909 |
+
"""
|
| 910 |
+
Inserts `response` into the cache for `request_id`.
|
| 911 |
+
"""
|
| 912 |
+
with self.cv:
|
| 913 |
+
self.cv.notify_all()
|
| 914 |
+
if req_id not in self.cache:
|
| 915 |
+
raise RuntimeError(
|
| 916 |
+
"Attempting to update the cache, but placeholder is "
|
| 917 |
+
"missing. This might happen on a redundant call to "
|
| 918 |
+
f"update_cache. ({req_id})"
|
| 919 |
+
)
|
| 920 |
+
self.cache[req_id] = resp
|
| 921 |
+
|
| 922 |
+
def invalidate(self, e: Exception) -> bool:
|
| 923 |
+
"""
|
| 924 |
+
Invalidate any partially populated cache entries, replacing their
|
| 925 |
+
placeholders with the passed in exception. Useful to prevent a thread
|
| 926 |
+
from waiting indefinitely on a failed call.
|
| 927 |
+
|
| 928 |
+
Returns True if the cache contains an error, False otherwise
|
| 929 |
+
"""
|
| 930 |
+
with self.cv:
|
| 931 |
+
invalid = False
|
| 932 |
+
for req_id in self.cache:
|
| 933 |
+
if self.cache[req_id] is None:
|
| 934 |
+
self.cache[req_id] = e
|
| 935 |
+
if isinstance(self.cache[req_id], Exception):
|
| 936 |
+
invalid = True
|
| 937 |
+
self.cv.notify_all()
|
| 938 |
+
return invalid
|
| 939 |
+
|
| 940 |
+
def cleanup(self, last_received: int) -> None:
|
| 941 |
+
"""
|
| 942 |
+
Cleanup all of the cached requests up to last_received. Assumes that
|
| 943 |
+
the cache entries were inserted in ascending order.
|
| 944 |
+
"""
|
| 945 |
+
with self.cv:
|
| 946 |
+
if _id_is_newer(last_received, self.last_received):
|
| 947 |
+
self.last_received = last_received
|
| 948 |
+
to_remove = []
|
| 949 |
+
for req_id in self.cache:
|
| 950 |
+
if _id_is_newer(last_received, req_id) or last_received == req_id:
|
| 951 |
+
to_remove.append(req_id)
|
| 952 |
+
else:
|
| 953 |
+
break
|
| 954 |
+
for req_id in to_remove:
|
| 955 |
+
del self.cache[req_id]
|
| 956 |
+
self.cv.notify_all()
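
# ----------------------------------------------------------------------------
# Editor's note: the snippet below is an illustrative sketch and is NOT part
# of the original file. It shows how the check_cache/update_cache protocol
# described in the ResponseCache docstring is meant to be driven by a caller;
# `do_work` is a hypothetical stand-in for the real servicer logic.
def _example_cached_call(cache, thread_id, request_id, do_work):
    # 1. See whether this (thread_id, request_id) pair has been seen before.
    cached = cache.check_cache(thread_id, request_id)
    if cached is not None:
        # A retried request whose response is already known: return it as-is
        # instead of re-running the server-side logic.
        return cached
    # 2. New request: (request_id, None) is now cached as a placeholder, so a
    #    concurrent retry will block in check_cache until the result is ready.
    response = do_work()
    # 3. Publish the response and wake up any retry waiting on the placeholder.
    cache.update_cache(thread_id, request_id, response)
    return response
# ----------------------------------------------------------------------------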
.venv/lib/python3.11/site-packages/ray/util/client/dataclient.py
ADDED
@@ -0,0 +1,599 @@
"""This file implements a threaded stream controller to abstract a data stream
back to the Ray client server.
"""
import math
import logging
import queue
import threading
import warnings
import grpc

from collections import OrderedDict
from typing import Any, Callable, Dict, TYPE_CHECKING, Optional, Union

import ray.core.generated.ray_client_pb2 as ray_client_pb2
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
from ray.util.client.common import (
    INT32_MAX,
    OBJECT_TRANSFER_CHUNK_SIZE,
    OBJECT_TRANSFER_WARNING_SIZE,
)
from ray.util.debug import log_once

if TYPE_CHECKING:
    from ray.util.client.worker import Worker

logger = logging.getLogger(__name__)

ResponseCallable = Callable[[Union[ray_client_pb2.DataResponse, Exception]], None]

# Send an acknowledge on every 32nd response received
ACKNOWLEDGE_BATCH_SIZE = 32


def chunk_put(req: ray_client_pb2.DataRequest):
    """
    Chunks a put request. Doing this lazily is important for large objects,
    since taking slices of bytes objects does a copy. This means that if we
    immediately materialized every chunk of a large object and inserted them
    into the result_queue, we would effectively double the memory needed
    on the client to handle the put.
    """
    # When accessing a protobuf field, deserialization is performed, which will
    # generate a copy. So we need to avoid accessing the `data` field multiple
    # times in the loop
    request_data = req.put.data
    total_size = len(request_data)
    assert total_size > 0, "Cannot chunk object with missing data"
    if total_size >= OBJECT_TRANSFER_WARNING_SIZE and log_once(
        "client_object_put_size_warning"
    ):
        size_gb = total_size / 2**30
        warnings.warn(
            "Ray Client is attempting to send a "
            f"{size_gb:.2f} GiB object over the network, which may "
            "be slow. Consider serializing the object and using a remote "
            "URI to transfer via S3 or Google Cloud Storage instead. "
            "Documentation for doing this can be found here: "
            "https://docs.ray.io/en/latest/handling-dependencies.html#remote-uris",
            UserWarning,
        )
    total_chunks = math.ceil(total_size / OBJECT_TRANSFER_CHUNK_SIZE)
    for chunk_id in range(0, total_chunks):
        start = chunk_id * OBJECT_TRANSFER_CHUNK_SIZE
        end = min(total_size, (chunk_id + 1) * OBJECT_TRANSFER_CHUNK_SIZE)
        chunk = ray_client_pb2.PutRequest(
            client_ref_id=req.put.client_ref_id,
            data=request_data[start:end],
            chunk_id=chunk_id,
            total_chunks=total_chunks,
            total_size=total_size,
            owner_id=req.put.owner_id,
        )
        yield ray_client_pb2.DataRequest(req_id=req.req_id, put=chunk)

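# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. It shows
# the chunk arithmetic used by chunk_put/chunk_task in isolation: a payload of
# `total_size` bytes is split into ceil(total_size / chunk_size) slices, and
# the last slice is clamped to the end of the payload.
def _example_chunk_bounds(total_size: int, chunk_size: int):
    total_chunks = math.ceil(total_size / chunk_size)
    return [
        (chunk_id * chunk_size, min(total_size, (chunk_id + 1) * chunk_size))
        for chunk_id in range(total_chunks)
    ]
# For example, _example_chunk_bounds(10, 4) == [(0, 4), (4, 8), (8, 10)].
# ----------------------------------------------------------------------------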

def chunk_task(req: ray_client_pb2.DataRequest):
    """
    Chunks a client task. Doing this lazily is important with large arguments,
    since taking slices of bytes objects does a copy. This means that if we
    immediately materialized every chunk of a large argument and inserted them
    into the result_queue, we would effectively double the memory needed
    on the client to handle the task.
    """
    # When accessing a protobuf field, deserialization is performed, which will
    # generate a copy. So we need to avoid accessing the `data` field multiple
    # times in the loop
    request_data = req.task.data
    total_size = len(request_data)
    assert total_size > 0, "Cannot chunk object with missing data"
    total_chunks = math.ceil(total_size / OBJECT_TRANSFER_CHUNK_SIZE)
    for chunk_id in range(0, total_chunks):
        start = chunk_id * OBJECT_TRANSFER_CHUNK_SIZE
        end = min(total_size, (chunk_id + 1) * OBJECT_TRANSFER_CHUNK_SIZE)
        chunk = ray_client_pb2.ClientTask(
            type=req.task.type,
            name=req.task.name,
            payload_id=req.task.payload_id,
            client_id=req.task.client_id,
            options=req.task.options,
            baseline_options=req.task.baseline_options,
            namespace=req.task.namespace,
            data=request_data[start:end],
            chunk_id=chunk_id,
            total_chunks=total_chunks,
        )
        yield ray_client_pb2.DataRequest(req_id=req.req_id, task=chunk)


class ChunkCollector:
    """
    This object collects chunks from async get requests via __call__, and
    calls the underlying callback when the object is fully received, or if an
    exception occurs while retrieving the object.

    This is not used in synchronous gets (synchronous gets interact with the
    raylet servicer directly, not through the datapath).

    __call__ returns True once the underlying callback has been called.
    """

    def __init__(self, callback: ResponseCallable, request: ray_client_pb2.DataRequest):
        # Bytearray containing data received so far
        self.data = bytearray()
        # The callback that will be called once all data is received
        self.callback = callback
        # The id of the last chunk we've received, or -1 if we haven't seen any yet
        self.last_seen_chunk = -1
        # The GetRequest that initiated the transfer. start_chunk_id will be
        # updated as chunks are received to avoid re-requesting chunks that
        # we've already received.
        self.request = request

    def __call__(self, response: Union[ray_client_pb2.DataResponse, Exception]) -> bool:
        if isinstance(response, Exception):
            self.callback(response)
            return True
        get_resp = response.get
        if not get_resp.valid:
            self.callback(response)
            return True
        if get_resp.total_size > OBJECT_TRANSFER_WARNING_SIZE and log_once(
            "client_object_transfer_size_warning"
        ):
            size_gb = get_resp.total_size / 2**30
            warnings.warn(
                "Ray Client is attempting to retrieve a "
                f"{size_gb:.2f} GiB object over the network, which may "
                "be slow. Consider serializing the object to a file and "
                "using rsync or S3 instead.",
                UserWarning,
            )
        chunk_data = get_resp.data
        chunk_id = get_resp.chunk_id
        if chunk_id == self.last_seen_chunk + 1:
            self.data.extend(chunk_data)
            self.last_seen_chunk = chunk_id
            # If we disconnect partway through, restart the get request
            # at the first chunk we haven't seen
            self.request.get.start_chunk_id = self.last_seen_chunk + 1
        elif chunk_id > self.last_seen_chunk + 1:
            # A chunk was skipped. This shouldn't happen in practice since
            # grpc guarantees that chunks will arrive in order.
            msg = (
                f"Received chunk {chunk_id} when we expected "
                f"{self.last_seen_chunk + 1} for request {response.req_id}"
            )
            logger.warning(msg)
            self.callback(RuntimeError(msg))
            return True
        else:
            # We received a chunk that we've already seen before. Ignore, since
            # it should already be appended to self.data.
            logger.debug(
                f"Received a repeated chunk {chunk_id} "
                f"from request {response.req_id}."
            )

        if get_resp.chunk_id == get_resp.total_chunks - 1:
            self.callback(self.data)
            return True
        else:
            # Not done yet
            return False

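# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. It mimics
# the in-order reassembly performed by ChunkCollector above: chunks are
# appended only when they carry the next expected id, duplicates are ignored,
# and the index of the first missing chunk is what a resumed GetRequest would
# use as its start_chunk_id.
def _example_reassemble(chunks):
    data = bytearray()
    last_seen = -1
    for chunk_id, payload in chunks:
        if chunk_id == last_seen + 1:
            data.extend(payload)
            last_seen = chunk_id
        # chunk_id <= last_seen means a duplicate that was already appended;
        # a gap (chunk_id > last_seen + 1) would be an error in the real code.
    next_needed = last_seen + 1  # what a resumed request would ask for
    return bytes(data), next_needed
# ----------------------------------------------------------------------------
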
class DataClient:
    def __init__(self, client_worker: "Worker", client_id: str, metadata: list):
        """Initializes a thread-safe datapath over a Ray Client gRPC channel.

        Args:
            client_worker: The Ray Client worker that manages this client
            client_id: the generated ID representing this client
            metadata: metadata to pass to gRPC requests
        """
        self.client_worker = client_worker
        self._client_id = client_id
        self._metadata = metadata
        self.data_thread = self._start_datathread()

        # Track outstanding requests to resend in case of disconnection
        self.outstanding_requests: Dict[int, Any] = OrderedDict()

        # Serialize access to all mutable internal states: self.request_queue,
        # self.ready_data, self.asyncio_waiting_data,
        # self._in_shutdown, self._req_id, self.outstanding_requests and
        # calling self._next_id()
        self.lock = threading.Lock()

        # Waiting for response or shutdown.
        self.cv = threading.Condition(lock=self.lock)

        self.request_queue = self._create_queue()
        self.ready_data: Dict[int, Any] = {}
        # NOTE: Dictionary insertion is guaranteed to complete before lookup
        # and/or removal because of synchronization via the request_queue.
        self.asyncio_waiting_data: Dict[int, ResponseCallable] = {}
        self._in_shutdown = False
        self._req_id = 0
        self._last_exception = None
        self._acknowledge_counter = 0

        self.data_thread.start()

    # Must hold self.lock when calling this function.
    def _next_id(self) -> int:
        assert self.lock.locked()
        self._req_id += 1
        if self._req_id > INT32_MAX:
            self._req_id = 1
        # Responses that aren't tracked (like opportunistic releases)
        # have req_id=0, so make sure we never mint such an id.
        assert self._req_id != 0
        return self._req_id

    def _start_datathread(self) -> threading.Thread:
        return threading.Thread(
            target=self._data_main,
            name="ray_client_streaming_rpc",
            args=(),
            daemon=True,
        )

    # A helper that takes requests from queue. If the request wraps a PutRequest,
    # lazily chunks and yields the request. Otherwise, yields the request directly.
    def _requests(self):
        while True:
            req = self.request_queue.get()
            if req is None:
                # Stop when client signals shutdown.
                return
            req_type = req.WhichOneof("type")
            if req_type == "put":
                yield from chunk_put(req)
            elif req_type == "task":
                yield from chunk_task(req)
            else:
                yield req

    def _data_main(self) -> None:
        reconnecting = False
        try:
            while not self.client_worker._in_shutdown:
                stub = ray_client_pb2_grpc.RayletDataStreamerStub(
                    self.client_worker.channel
                )
                metadata = self._metadata + [("reconnecting", str(reconnecting))]
                resp_stream = stub.Datapath(
                    self._requests(),
                    metadata=metadata,
                    wait_for_ready=True,
                )
                try:
                    for response in resp_stream:
                        self._process_response(response)
                    return
                except grpc.RpcError as e:
                    reconnecting = self._can_reconnect(e)
                    if not reconnecting:
                        self._last_exception = e
                        return
                    self._reconnect_channel()
        except Exception as e:
            self._last_exception = e
        finally:
            logger.debug("Shutting down data channel.")
            self._shutdown()

    def _process_response(self, response: Any) -> None:
        """
        Process responses from the data servicer.
        """
        if response.req_id == 0:
            # This is not being waited for.
            logger.debug(f"Got unawaited response {response}")
            return
        if response.req_id in self.asyncio_waiting_data:
            can_remove = True
            try:
                callback = self.asyncio_waiting_data[response.req_id]
                if isinstance(callback, ChunkCollector):
                    can_remove = callback(response)
                elif callback:
                    callback(response)
                if can_remove:
                    # NOTE: calling del self.asyncio_waiting_data results
                    # in the destructor of ClientObjectRef running, which
                    # calls ReleaseObject(). So self.asyncio_waiting_data
                    # is accessed without holding self.lock. Holding the
                    # lock shouldn't be necessary either.
                    del self.asyncio_waiting_data[response.req_id]
            except Exception:
                logger.exception("Callback error:")
            with self.lock:
                # Update outstanding requests
                if response.req_id in self.outstanding_requests and can_remove:
                    del self.outstanding_requests[response.req_id]
                    # Acknowledge response
                    self._acknowledge(response.req_id)
        else:
            with self.lock:
                self.ready_data[response.req_id] = response
                self.cv.notify_all()

    def _can_reconnect(self, e: grpc.RpcError) -> bool:
        """
        Processes RPC errors that occur while reading from the data stream.
        Returns True if the error can be recovered from, False otherwise.
        """
        if not self.client_worker._can_reconnect(e):
            logger.error("Unrecoverable error in data channel.")
            logger.debug(e)
            return False
        logger.debug("Recoverable error in data channel.")
        logger.debug(e)
        return True

    def _shutdown(self) -> None:
        """
        Shut down the data channel.
        """
        with self.lock:
            self._in_shutdown = True
            self.cv.notify_all()

        callbacks = self.asyncio_waiting_data.values()
        self.asyncio_waiting_data = {}

        if self._last_exception:
            # Abort async requests with the error.
            err = ConnectionError(
                "Failed during this or a previous request. Exception that "
                f"broke the connection: {self._last_exception}"
            )
        else:
            err = ConnectionError(
                "Request cannot be fulfilled because the data client has "
                "disconnected."
            )
        for callback in callbacks:
            if callback:
                callback(err)
        # Since self._in_shutdown is set to True, no new item
        # will be added to self.asyncio_waiting_data

    def _acknowledge(self, req_id: int) -> None:
        """
        Puts an acknowledge request on the request queue periodically.
        Lock should be held before calling this. Used when an async or
        blocking response is received.
        """
        if not self.client_worker._reconnect_enabled:
            # Skip ACKs if reconnect isn't enabled
            return
        assert self.lock.locked()
        self._acknowledge_counter += 1
        if self._acknowledge_counter % ACKNOWLEDGE_BATCH_SIZE == 0:
            self.request_queue.put(
                ray_client_pb2.DataRequest(
                    acknowledge=ray_client_pb2.AcknowledgeRequest(req_id=req_id)
                )
            )

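    # Editor's note (not in the original file): the acknowledgements produced
    # by _acknowledge() are what let the server-side OrderedResponseCache
    # (see its cleanup() method earlier in this diff) drop cached responses
    # the client has safely received. With ACKNOWLEDGE_BATCH_SIZE = 32, an
    # ack is enqueued after the 32nd, 64th, 96th, ... tracked response rather
    # than after every response, which keeps ack traffic on the datapath small.
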
    def _reconnect_channel(self) -> None:
        """
        Attempts to reconnect the gRPC channel and resend outstanding
        requests. First, the server is pinged to see if the current channel
        still works. If the ping fails, then the current channel is closed
        and replaced with a new one.

        Once a working channel is available, a new request queue is made
        and filled with any outstanding requests to be resent to the server.
        """
        try:
            # Ping the server to see if the current channel is reusable, for
            # example if gRPC reconnected the channel on its own or if the
            # RPC error was transient and the channel is still open
            ping_succeeded = self.client_worker.ping_server(timeout=5)
        except grpc.RpcError:
            ping_succeeded = False

        if not ping_succeeded:
            # Ping failed, try refreshing the data channel
            logger.warning(
                "Encountered connection issues in the data channel. "
                "Attempting to reconnect."
            )
            try:
                self.client_worker._connect_channel(reconnecting=True)
            except ConnectionError:
                logger.warning("Failed to reconnect the data channel")
                raise
            logger.debug("Reconnection succeeded!")

        # Recreate the request queue, and resend outstanding requests
        with self.lock:
            self.request_queue = self._create_queue()
            for request in self.outstanding_requests.values():
                # Resend outstanding requests
                self.request_queue.put(request)

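    # Editor's note (not in the original file): replaying
    # self.outstanding_requests after a reconnect is safe because requests
    # keep their original req_id and the server caches responses per req_id
    # (see ResponseCache / OrderedResponseCache earlier in this diff), so a
    # request that already executed is answered from the cache instead of
    # being applied twice. outstanding_requests is an OrderedDict, so the
    # requests are resent in the order they were first issued.
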
    # Use SimpleQueue to avoid deadlocks when appending to queue from __del__()
    @staticmethod
    def _create_queue():
        return queue.SimpleQueue()

    def close(self) -> None:
        thread = None
        with self.lock:
            self._in_shutdown = True
            # Notify blocking operations to fail.
            self.cv.notify_all()
            # Add sentinel to terminate streaming RPC.
            if self.request_queue is not None:
                # Intentional shutdown, tell server it can clean up the
                # connection immediately and ignore the reconnect grace period.
                cleanup_request = ray_client_pb2.DataRequest(
                    connection_cleanup=ray_client_pb2.ConnectionCleanupRequest()
                )
                self.request_queue.put(cleanup_request)
                self.request_queue.put(None)
            if self.data_thread is not None:
                thread = self.data_thread
        # Wait until streaming RPCs are done.
        if thread is not None:
            thread.join()

    def _blocking_send(
        self, req: ray_client_pb2.DataRequest
    ) -> ray_client_pb2.DataResponse:
        with self.lock:
            self._check_shutdown()
            req_id = self._next_id()
            req.req_id = req_id
            self.request_queue.put(req)
            self.outstanding_requests[req_id] = req

            self.cv.wait_for(lambda: req_id in self.ready_data or self._in_shutdown)
            self._check_shutdown()

            data = self.ready_data[req_id]
            del self.ready_data[req_id]
            del self.outstanding_requests[req_id]
            self._acknowledge(req_id)

        return data

    def _async_send(
        self,
        req: ray_client_pb2.DataRequest,
        callback: Optional[ResponseCallable] = None,
    ) -> None:
        with self.lock:
            self._check_shutdown()
            req_id = self._next_id()
            req.req_id = req_id
            self.asyncio_waiting_data[req_id] = callback
            self.outstanding_requests[req_id] = req
            self.request_queue.put(req)

    # Must hold self.lock when calling this function.
    def _check_shutdown(self):
        assert self.lock.locked()
        if not self._in_shutdown:
            return

        self.lock.release()

        # Do not try disconnect() or throw exceptions in self.data_thread.
        # Otherwise deadlock can occur.
        if threading.current_thread().ident == self.data_thread.ident:
            return

        from ray.util import disconnect

        disconnect()

        self.lock.acquire()

        if self._last_exception is not None:
            msg = (
                "Request can't be sent because the Ray client has already "
                "been disconnected due to an error. Last exception: "
                f"{self._last_exception}"
            )
        else:
            msg = (
                "Request can't be sent because the Ray client has already "
                "been disconnected."
            )

        raise ConnectionError(msg)

    def Init(
        self, request: ray_client_pb2.InitRequest, context=None
    ) -> ray_client_pb2.InitResponse:
        datareq = ray_client_pb2.DataRequest(
            init=request,
        )
        resp = self._blocking_send(datareq)
        return resp.init

    def PrepRuntimeEnv(
        self, request: ray_client_pb2.PrepRuntimeEnvRequest, context=None
    ) -> ray_client_pb2.PrepRuntimeEnvResponse:
        datareq = ray_client_pb2.DataRequest(
            prep_runtime_env=request,
        )
        resp = self._blocking_send(datareq)
        return resp.prep_runtime_env

    def ConnectionInfo(self, context=None) -> ray_client_pb2.ConnectionInfoResponse:
        datareq = ray_client_pb2.DataRequest(
            connection_info=ray_client_pb2.ConnectionInfoRequest()
        )
        resp = self._blocking_send(datareq)
        return resp.connection_info

    def GetObject(
        self, request: ray_client_pb2.GetRequest, context=None
    ) -> ray_client_pb2.GetResponse:
        datareq = ray_client_pb2.DataRequest(
            get=request,
        )
        resp = self._blocking_send(datareq)
        return resp.get

    def RegisterGetCallback(
        self, request: ray_client_pb2.GetRequest, callback: ResponseCallable
    ) -> None:
        if len(request.ids) != 1:
            raise ValueError(
                "RegisterGetCallback() must have exactly 1 Object ID. "
                f"Actual: {request}"
            )
        datareq = ray_client_pb2.DataRequest(
            get=request,
        )
        collector = ChunkCollector(callback=callback, request=datareq)
        self._async_send(datareq, collector)

    # TODO: convert PutObject to async
    def PutObject(
        self, request: ray_client_pb2.PutRequest, context=None
    ) -> ray_client_pb2.PutResponse:
        datareq = ray_client_pb2.DataRequest(
            put=request,
        )
        resp = self._blocking_send(datareq)
        return resp.put

    def ReleaseObject(
        self, request: ray_client_pb2.ReleaseRequest, context=None
    ) -> None:
        datareq = ray_client_pb2.DataRequest(
            release=request,
        )
        self._async_send(datareq)

    def Schedule(self, request: ray_client_pb2.ClientTask, callback: ResponseCallable):
        datareq = ray_client_pb2.DataRequest(task=request)
        self._async_send(datareq, callback)

    def Terminate(
        self, request: ray_client_pb2.TerminateRequest
    ) -> ray_client_pb2.TerminateResponse:
        req = ray_client_pb2.DataRequest(
            terminate=request,
        )
        resp = self._blocking_send(req)
        return resp.terminate

    def ListNamedActors(
        self, request: ray_client_pb2.ClientListNamedActorsRequest
    ) -> ray_client_pb2.ClientListNamedActorsResponse:
        req = ray_client_pb2.DataRequest(
            list_named_actors=request,
        )
        resp = self._blocking_send(req)
        return resp.list_named_actors
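
# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. It models
# the _blocking_send() pattern above with plain threading primitives: the
# caller parks on a condition variable until a background reader thread
# publishes a response under the same request id. The names here are
# hypothetical stand-ins, not the real gRPC datapath.
if __name__ == "__main__":
    import threading as _threading

    _lock = _threading.Lock()
    _cv = _threading.Condition(_lock)
    _ready = {}

    def _reader(req_id, value):
        # Stands in for _data_main()/_process_response() publishing a reply.
        with _cv:
            _ready[req_id] = value
            _cv.notify_all()

    def _blocking(req_id):
        # Stands in for _blocking_send(): wait until the reply shows up.
        with _cv:
            _cv.wait_for(lambda: req_id in _ready)
            return _ready.pop(req_id)

    _threading.Timer(0.1, _reader, args=(1, "pong")).start()
    assert _blocking(1) == "pong"
# ----------------------------------------------------------------------------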
.venv/lib/python3.11/site-packages/ray/util/client/options.py
ADDED
@@ -0,0 +1,47 @@
from typing import Any
from typing import Dict
from typing import Optional

from ray._private import ray_option_utils
from ray.util.placement_group import PlacementGroup, check_placement_group_index
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy


def validate_options(kwargs_dict: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    if kwargs_dict is None:
        return None
    if len(kwargs_dict) == 0:
        return None

    out = {}
    for k, v in kwargs_dict.items():
        if k not in ray_option_utils.valid_options:
            raise ValueError(
                f"Invalid option keyword: '{k}'. "
                f"{ray_option_utils.remote_args_error_string}"
            )
        ray_option_utils.valid_options[k].validate(k, v)
        out[k] = v

    # Validate placement setting similar to the logic in ray/actor.py and
    # ray/remote_function.py. The difference is that when placement_group is
    # "default" and placement_group_capture_child_tasks is specified, the
    # placement group cannot be resolved at the client, so this check skips
    # that case and relies on the server to enforce any condition.
    bundle_index = out.get("placement_group_bundle_index", None)
    pg = out.get("placement_group", None)
    scheduling_strategy = out.get("scheduling_strategy", None)
    if isinstance(scheduling_strategy, PlacementGroupSchedulingStrategy):
        pg = scheduling_strategy.placement_group
        bundle_index = scheduling_strategy.placement_group_bundle_index
    if bundle_index is not None:
        if pg is None:
            pg = PlacementGroup.empty()
        if pg == "default" and (
            out.get("placement_group_capture_child_tasks", None) is None
        ):
            pg = PlacementGroup.empty()
        if isinstance(pg, PlacementGroup):
            check_placement_group_index(pg, bundle_index)

    return out
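
# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. Expected
# behaviour of validate_options(): valid @ray.remote-style keyword options are
# validated and passed through, while unknown keywords raise ValueError. The
# exact set of accepted names comes from ray_option_utils.valid_options.
#
#   validate_options(None)                  -> None
#   validate_options({})                    -> None
#   validate_options({"num_cpus": 1})       -> {"num_cpus": 1}
#   validate_options({"not_an_option": 1})  -> raises ValueError
# ----------------------------------------------------------------------------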
.venv/lib/python3.11/site-packages/ray/util/client/ray_client_helpers.py
ADDED
@@ -0,0 +1,115 @@
from contextlib import contextmanager
import time
from typing import Any, Dict

import ray as real_ray
from ray.job_config import JobConfig
import ray.util.client.server.server as ray_client_server
from ray.util.client import ray
from ray._private.client_mode_hook import enable_client_mode, disable_client_hook


@contextmanager
def ray_start_client_server(metadata=None, ray_connect_handler=None, **kwargs):
    with ray_start_client_server_pair(
        metadata=metadata, ray_connect_handler=ray_connect_handler, **kwargs
    ) as pair:
        client, server = pair
        yield client


@contextmanager
def ray_start_client_server_for_address(address):
    """
    Starts a Ray client server that initializes drivers at the specified address.
    """

    def connect_handler(
        job_config: JobConfig = None, **ray_init_kwargs: Dict[str, Any]
    ):
        import ray

        with disable_client_hook():
            if not ray.is_initialized():
                return ray.init(address, job_config=job_config, **ray_init_kwargs)

    with ray_start_client_server(ray_connect_handler=connect_handler) as ray:
        yield ray


@contextmanager
def ray_start_client_server_pair(metadata=None, ray_connect_handler=None, **kwargs):
    ray._inside_client_test = True
    with disable_client_hook():
        assert not ray.is_initialized()
    server = ray_client_server.serve(
        "127.0.0.1:50051", ray_connect_handler=ray_connect_handler
    )
    ray.connect("127.0.0.1:50051", metadata=metadata, **kwargs)
    try:
        yield ray, server
    finally:
        ray._inside_client_test = False
        ray.disconnect()
        server.stop(0)
        del server
        start = time.monotonic()
        with disable_client_hook():
            while ray.is_initialized():
                time.sleep(1)
                if time.monotonic() - start > 30:
                    raise RuntimeError("Failed to terminate Ray")
        # Allow windows to close processes before moving on
        time.sleep(3)


@contextmanager
def ray_start_cluster_client_server_pair(address):
    ray._inside_client_test = True

    def ray_connect_handler(job_config=None, **ray_init_kwargs):
        real_ray.init(address=address)

    server = ray_client_server.serve(
        "127.0.0.1:50051", ray_connect_handler=ray_connect_handler
    )
    ray.connect("127.0.0.1:50051")
    try:
        yield ray, server
    finally:
        ray._inside_client_test = False
        ray.disconnect()
        server.stop(0)


@contextmanager
def connect_to_client_or_not(connect_to_client: bool):
    """Utility for running test logic with and without a Ray client connection.

    If connect_to_client is True, will connect to Ray client in context.
    If connect_to_client is False, does nothing.

    How to use:
    Given a test of the following form:

    def test_<name>(args):
        <initialize a ray cluster>
        <use the ray cluster>

    Modify the test to

    @pytest.mark.parametrize("connect_to_client", [False, True])
    def test_<name>(args, connect_to_client):
        <initialize a ray cluster>
        with connect_to_client_or_not(connect_to_client):
            <use the ray cluster>

    Parameterize the argument connect_to_client over True and False to run the
    test with and without a Ray client connection.
    """

    if connect_to_client:
        with ray_start_client_server(namespace=""), enable_client_mode():
            yield
    else:
        yield
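
# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. Typical
# use of the helpers above in a test; the remote function is hypothetical and
# the snippet assumes a local Ray installation able to start a client server.
#
#   def test_roundtrip():
#       with ray_start_client_server() as ray:
#           @ray.remote
#           def echo(x):
#               return x
#           assert ray.get(echo.remote(42)) == 42
# ----------------------------------------------------------------------------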
.venv/lib/python3.11/site-packages/ray/util/client/runtime_context.py
ADDED
@@ -0,0 +1,65 @@
from typing import TYPE_CHECKING
from types import SimpleNamespace

if TYPE_CHECKING:
    from ray import JobID, NodeID
    from ray.runtime_context import RuntimeContext


class _ClientWorkerPropertyAPI:
    """Emulates the properties of the ray._private.worker object for the client"""

    def __init__(self, worker):
        assert worker is not None
        self.worker = worker

    def build_runtime_context(self) -> "RuntimeContext":
        """Creates a RuntimeContext backed by the properties of this API"""
        # Defer the import of RuntimeContext until needed to avoid cycles
        from ray.runtime_context import RuntimeContext

        return RuntimeContext(self)

    def _fetch_runtime_context(self):
        import ray.core.generated.ray_client_pb2 as ray_client_pb2

        return self.worker.get_cluster_info(
            ray_client_pb2.ClusterInfoType.RUNTIME_CONTEXT
        )

    @property
    def mode(self):
        from ray._private.worker import SCRIPT_MODE

        return SCRIPT_MODE

    @property
    def current_job_id(self) -> "JobID":
        from ray import JobID

        return JobID(self._fetch_runtime_context().job_id)

    @property
    def current_node_id(self) -> "NodeID":
        from ray import NodeID

        return NodeID(self._fetch_runtime_context().node_id)

    @property
    def namespace(self) -> str:
        return self._fetch_runtime_context().namespace

    @property
    def should_capture_child_tasks_in_placement_group(self) -> bool:
        return self._fetch_runtime_context().capture_client_tasks

    @property
    def runtime_env(self) -> str:
        return self._fetch_runtime_context().runtime_env

    def check_connected(self) -> bool:
        return self.worker.ping_server()

    @property
    def gcs_client(self) -> str:
        return SimpleNamespace(address=self._fetch_runtime_context().gcs_address)
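
# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. The
# property API above is consumed roughly like this on the client side, where
# `worker` is a connected ray.util.client.worker.Worker (hypothetical here):
#
#   api = _ClientWorkerPropertyAPI(worker)
#   ctx = api.build_runtime_context()
#   # ctx is a ray.runtime_context.RuntimeContext whose reads (job id,
#   # namespace, runtime_env, ...) are served by the properties above, each
#   # of which fetches the RUNTIME_CONTEXT cluster info over gRPC.
# ----------------------------------------------------------------------------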
.venv/lib/python3.11/site-packages/ray/util/client/worker.py
ADDED
@@ -0,0 +1,908 @@
"""This file includes the Worker class which sits on the client side.
It implements the Ray API functions that are forwarded through gRPC calls
to the server.
"""
import base64
import json
import logging
import os
import tempfile
import threading
import time
import uuid
import warnings
from collections import defaultdict
from concurrent.futures import Future
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import grpc

import ray._private.tls_utils
import ray.cloudpickle as cloudpickle
import ray.core.generated.ray_client_pb2 as ray_client_pb2
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
from ray._private.ray_constants import DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD
from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
from ray._private.runtime_env.working_dir import upload_working_dir_if_needed

# Use cloudpickle's version of pickle for UnpicklingError
from ray.cloudpickle.compat import pickle
from ray.exceptions import GetTimeoutError
from ray.job_config import JobConfig
from ray.util.client.client_pickler import dumps_from_client, loads_from_server
from ray.util.client.common import (
    GRPC_OPTIONS,
    GRPC_UNRECOVERABLE_ERRORS,
    INT32_MAX,
    OBJECT_TRANSFER_WARNING_SIZE,
    ClientActorClass,
    ClientActorHandle,
    ClientActorRef,
    ClientObjectRef,
    ClientRemoteFunc,
    ClientStub,
)
from ray.util.client.dataclient import DataClient
from ray.util.client.logsclient import LogstreamClient
from ray.util.debug import log_once

if TYPE_CHECKING:
    from ray.actor import ActorClass
    from ray.remote_function import RemoteFunction

logger = logging.getLogger(__name__)

INITIAL_TIMEOUT_SEC = 5
MAX_TIMEOUT_SEC = 30

# The max amount of time an operation can run blocking in the server. This
# allows for Ctrl-C of the client to work without explicitly cancelling server
# operations.
MAX_BLOCKING_OPERATION_TIME_S: float = 2.0

# If the total size (bytes) of all outbound messages to schedule tasks since
# the connection began exceeds this value, a warning should be raised
MESSAGE_SIZE_THRESHOLD = 10 * 2**20  # 10 MB

# Links to the Ray Design Pattern doc to use in the task overhead warning
# message
DESIGN_PATTERN_FINE_GRAIN_TASKS_LINK = "https://docs.google.com/document/d/167rnnDFIVRhHhK4mznEIemOtj63IOhtIPvSYaPgI4Fg/edit#heading=h.f7ins22n6nyl"  # noqa E501

DESIGN_PATTERN_LARGE_OBJECTS_LINK = "https://docs.google.com/document/d/167rnnDFIVRhHhK4mznEIemOtj63IOhtIPvSYaPgI4Fg/edit#heading=h.1afmymq455wu"  # noqa E501


def backoff(timeout: int) -> int:
    timeout = timeout + 5
    if timeout > MAX_TIMEOUT_SEC:
        timeout = MAX_TIMEOUT_SEC
    return timeout

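# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original file. backoff()
# grows the connection timeout linearly by 5s per retry and caps it at
# MAX_TIMEOUT_SEC, so starting from INITIAL_TIMEOUT_SEC = 5 the successive
# timeouts are 5, 10, 15, 20, 25, 30, 30, ... seconds.
def _example_backoff_schedule(retries: int):
    timeout, schedule = INITIAL_TIMEOUT_SEC, []
    for _ in range(retries):
        schedule.append(timeout)
        timeout = backoff(timeout)
    return schedule
# _example_backoff_schedule(7) == [5, 10, 15, 20, 25, 30, 30]
# ----------------------------------------------------------------------------
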

class Worker:
    def __init__(
        self,
        conn_str: str = "",
        secure: bool = False,
        metadata: List[Tuple[str, str]] = None,
        connection_retries: int = 3,
        _credentials: Optional[grpc.ChannelCredentials] = None,
    ):
        """Initializes the worker-side gRPC client.

        Args:
            conn_str: The host:port connection string for the ray server.
            secure: whether to use SSL secure channel or not.
            metadata: additional metadata passed in the grpc request headers.
            connection_retries: Number of times to attempt to reconnect to the
                ray server if it doesn't respond immediately. Setting to 0 tries
                at least once. For infinite retries, catch the ConnectionError
                exception.
            _credentials: gRPC channel credentials. Default ones will be used
                if None.
        """
        self._client_id = make_client_id()
        self.metadata = [("client_id", self._client_id)] + (
            metadata if metadata else []
        )
        self.channel = None
        self.server = None
        self._conn_state = grpc.ChannelConnectivity.IDLE
        self._converted: Dict[str, ClientStub] = {}
        self._secure = secure or os.environ.get("RAY_USE_TLS", "0").lower() in (
            "1",
            "true",
        )
        self._conn_str = conn_str
        self._connection_retries = connection_retries

        if _credentials is not None:
            self._credentials = _credentials
            self._secure = True
        else:
            self._credentials = None

        self._reconnect_grace_period = DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD
        if "RAY_CLIENT_RECONNECT_GRACE_PERIOD" in os.environ:
            # Use value in environment variable if available
            self._reconnect_grace_period = int(
                os.environ["RAY_CLIENT_RECONNECT_GRACE_PERIOD"]
            )
        # Disable retries if grace period is set to 0
        self._reconnect_enabled = self._reconnect_grace_period != 0

        # Set to True when the connection cannot be recovered and reconnect
        # attempts should be stopped
        self._in_shutdown = False
        # Set to True after initial connection succeeds
        self._has_connected = False

        self._connect_channel()
        self._has_connected = True

        # Has Ray been initialized on the server?
        self._serverside_ray_initialized = False

        # Initialize the streams to finish protocol negotiation.
        self.data_client = DataClient(self, self._client_id, self.metadata)
        self.reference_count: Dict[bytes, int] = defaultdict(int)

        self.log_client = LogstreamClient(self, self.metadata)
        self.log_client.set_logstream_level(logging.INFO)

        self.closed = False

        # Track this value to raise a warning if a lot of data is transferred.
        self.total_outbound_message_size_bytes = 0

        # Used to create unique IDs for RPCs to the RayletServicer
        self._req_id_lock = threading.Lock()
        self._req_id = 0

    def _connect_channel(self, reconnecting=False) -> None:
        """
        Attempts to connect to the server specified by conn_str. If
        reconnecting after an RPC error, cleans up the old channel and
        continues to attempt to connect until the grace period is over.
        """
        if self.channel is not None:
            self.channel.unsubscribe(self._on_channel_state_change)
            self.channel.close()

        if self._secure:
            if self._credentials is not None:
                credentials = self._credentials
            elif os.environ.get("RAY_USE_TLS", "0").lower() in ("1", "true"):
                (
                    server_cert_chain,
                    private_key,
                    ca_cert,
                ) = ray._private.tls_utils.load_certs_from_env()
                credentials = grpc.ssl_channel_credentials(
                    certificate_chain=server_cert_chain,
                    private_key=private_key,
                    root_certificates=ca_cert,
                )
            else:
                credentials = grpc.ssl_channel_credentials()
            self.channel = grpc.secure_channel(
                self._conn_str, credentials, options=GRPC_OPTIONS
            )
        else:
            self.channel = grpc.insecure_channel(self._conn_str, options=GRPC_OPTIONS)

        self.channel.subscribe(self._on_channel_state_change)

        # Retry the connection until the channel responds to something
        # looking like a gRPC connection, though it may be a proxy.
        start_time = time.time()
        conn_attempts = 0
        timeout = INITIAL_TIMEOUT_SEC
        service_ready = False
        while conn_attempts < max(self._connection_retries, 1) or reconnecting:
            conn_attempts += 1
            if self._in_shutdown:
                # User manually closed the worker before connection finished
                break
            elapsed_time = time.time() - start_time
            if reconnecting and elapsed_time > self._reconnect_grace_period:
                self._in_shutdown = True
                raise ConnectionError(
                    "Failed to reconnect within the reconnection grace period "
                    f"({self._reconnect_grace_period}s)"
                )
            try:
                # Let gRPC wait for us to see if the channel becomes ready.
                # If it throws, we couldn't connect.
                grpc.channel_ready_future(self.channel).result(timeout=timeout)
                # The HTTP2 channel is ready. Wrap the channel with the
                # RayletDriverStub, allowing for unary requests.
                self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
                service_ready = bool(self.ping_server())
                if service_ready:
                    break
                # Ray is not ready yet, wait a timeout
                time.sleep(timeout)
            except grpc.FutureTimeoutError:
                logger.debug(f"Couldn't connect channel in {timeout} seconds, retrying")
                # Note that channel_ready_future constitutes its own timeout,
                # which is why we do not sleep here.
            except grpc.RpcError as e:
                logger.debug(
|
| 231 |
+
"Ray client server unavailable, " f"retrying in {timeout}s..."
|
| 232 |
+
)
|
| 233 |
+
logger.debug(f"Received when checking init: {e.details()}")
|
| 234 |
+
# Ray is not ready yet, wait a timeout.
|
| 235 |
+
time.sleep(timeout)
|
| 236 |
+
# Fallthrough, backoff, and retry at the top of the loop
|
| 237 |
+
logger.debug(
|
| 238 |
+
"Waiting for Ray to become ready on the server, "
|
| 239 |
+
f"retry in {timeout}s..."
|
| 240 |
+
)
|
| 241 |
+
if not reconnecting:
|
| 242 |
+
# Don't increase backoff when trying to reconnect --
|
| 243 |
+
# we already know the server exists, attempt to reconnect
|
| 244 |
+
# as soon as we can
|
| 245 |
+
timeout = backoff(timeout)
|
| 246 |
+
|
| 247 |
+
# If we made it through the loop without service_ready
|
| 248 |
+
# it means we've used up our retries and
|
| 249 |
+
# should error back to the user.
|
| 250 |
+
if not service_ready:
|
| 251 |
+
self._in_shutdown = True
|
| 252 |
+
if log_once("ray_client_security_groups"):
|
| 253 |
+
warnings.warn(
|
| 254 |
+
"Ray Client connection timed out. Ensure that "
|
| 255 |
+
"the Ray Client port on the head node is reachable "
|
| 256 |
+
"from your local machine. See https://docs.ray.io/en"
|
| 257 |
+
"/latest/cluster/ray-client.html#step-2-check-ports for "
|
| 258 |
+
"more information."
|
| 259 |
+
)
|
| 260 |
+
raise ConnectionError("ray client connection timeout")
|
| 261 |
+
|
| 262 |
+
def _can_reconnect(self, e: grpc.RpcError) -> bool:
|
| 263 |
+
"""
|
| 264 |
+
Returns True if the RPC error can be recovered from and a retry is
|
| 265 |
+
appropriate, false otherwise.
|
| 266 |
+
"""
|
| 267 |
+
if not self._reconnect_enabled:
|
| 268 |
+
return False
|
| 269 |
+
if self._in_shutdown:
|
| 270 |
+
# Channel is being shutdown, don't try to reconnect
|
| 271 |
+
return False
|
| 272 |
+
if e.code() in GRPC_UNRECOVERABLE_ERRORS:
|
| 273 |
+
# Unrecoverable error -- These errors are specifically raised
|
| 274 |
+
# by the server's application logic
|
| 275 |
+
return False
|
| 276 |
+
if e.code() == grpc.StatusCode.INTERNAL:
|
| 277 |
+
details = e.details()
|
| 278 |
+
if details == "Exception serializing request!":
|
| 279 |
+
# The client failed tried to send a bad request (for example,
|
| 280 |
+
# passing "None" instead of a valid grpc message). Don't
|
| 281 |
+
# try to reconnect/retry.
|
| 282 |
+
return False
|
| 283 |
+
# All other errors can be treated as recoverable
|
| 284 |
+
return True
|
| 285 |
+
|
| 286 |
+
def _call_stub(self, stub_name: str, *args, **kwargs) -> Any:
|
| 287 |
+
"""
|
| 288 |
+
Calls the stub specified by stub_name (Schedule, WaitObject, etc...).
|
| 289 |
+
If a recoverable error occurrs while calling the stub, attempts to
|
| 290 |
+
retry the RPC.
|
| 291 |
+
"""
|
| 292 |
+
while not self._in_shutdown:
|
| 293 |
+
try:
|
| 294 |
+
return getattr(self.server, stub_name)(*args, **kwargs)
|
| 295 |
+
except grpc.RpcError as e:
|
| 296 |
+
if self._can_reconnect(e):
|
| 297 |
+
time.sleep(0.5)
|
| 298 |
+
continue
|
| 299 |
+
raise
|
| 300 |
+
except ValueError:
|
| 301 |
+
# Trying to use the stub on a cancelled channel will raise
|
| 302 |
+
# ValueError. This should only happen when the data client
|
| 303 |
+
# is attempting to reset the connection -- sleep and try
|
| 304 |
+
# again.
|
| 305 |
+
time.sleep(0.5)
|
| 306 |
+
continue
|
| 307 |
+
raise ConnectionError("Client is shutting down.")
|
| 308 |
+
|
| 309 |
+
def _get_object_iterator(
|
| 310 |
+
self, req: ray_client_pb2.GetRequest, *args, **kwargs
|
| 311 |
+
) -> Any:
|
| 312 |
+
"""
|
| 313 |
+
Calls the stub for GetObject on the underlying server stub. If a
|
| 314 |
+
recoverable error occurs while streaming the response, attempts
|
| 315 |
+
to retry the get starting from the first chunk that hasn't been
|
| 316 |
+
received.
|
| 317 |
+
"""
|
| 318 |
+
last_seen_chunk = -1
|
| 319 |
+
while not self._in_shutdown:
|
| 320 |
+
# If we disconnect partway through, restart the get request
|
| 321 |
+
# at the first chunk we haven't seen
|
| 322 |
+
req.start_chunk_id = last_seen_chunk + 1
|
| 323 |
+
try:
|
| 324 |
+
for chunk in self.server.GetObject(req, *args, **kwargs):
|
| 325 |
+
if chunk.chunk_id <= last_seen_chunk:
|
| 326 |
+
# Ignore repeat chunks
|
| 327 |
+
logger.debug(
|
| 328 |
+
f"Received a repeated chunk {chunk.chunk_id} "
|
| 329 |
+
f"from request {req.req_id}."
|
| 330 |
+
)
|
| 331 |
+
continue
|
| 332 |
+
if last_seen_chunk + 1 != chunk.chunk_id:
|
| 333 |
+
raise RuntimeError(
|
| 334 |
+
f"Received chunk {chunk.chunk_id} when we expected "
|
| 335 |
+
f"{self.last_seen_chunk + 1}"
|
| 336 |
+
)
|
| 337 |
+
last_seen_chunk = chunk.chunk_id
|
| 338 |
+
yield chunk
|
| 339 |
+
if last_seen_chunk == chunk.total_chunks - 1:
|
| 340 |
+
# We've yielded the last chunk, exit early
|
| 341 |
+
return
|
| 342 |
+
return
|
| 343 |
+
except grpc.RpcError as e:
|
| 344 |
+
if self._can_reconnect(e):
|
| 345 |
+
time.sleep(0.5)
|
| 346 |
+
continue
|
| 347 |
+
raise
|
| 348 |
+
except ValueError:
|
| 349 |
+
# Trying to use the stub on a cancelled channel will raise
|
| 350 |
+
# ValueError. This should only happen when the data client
|
| 351 |
+
# is attempting to reset the connection -- sleep and try
|
| 352 |
+
# again.
|
| 353 |
+
time.sleep(0.5)
|
| 354 |
+
continue
|
| 355 |
+
raise ConnectionError("Client is shutting down.")
|
| 356 |
+
|
| 357 |
+
def _add_ids_to_metadata(self, metadata: Any):
|
| 358 |
+
"""
|
| 359 |
+
Adds a unique req_id and the current thread's identifier to the
|
| 360 |
+
metadata. These values are useful for preventing mutating operations
|
| 361 |
+
from being replayed on the server side in the event that the client
|
| 362 |
+
must retry a requsest.
|
| 363 |
+
Args:
|
| 364 |
+
metadata - the gRPC metadata to append the IDs to
|
| 365 |
+
"""
|
| 366 |
+
if not self._reconnect_enabled:
|
| 367 |
+
# IDs not needed if the reconnects are disabled
|
| 368 |
+
return metadata
|
| 369 |
+
thread_id = str(threading.get_ident())
|
| 370 |
+
with self._req_id_lock:
|
| 371 |
+
self._req_id += 1
|
| 372 |
+
if self._req_id > INT32_MAX:
|
| 373 |
+
self._req_id = 1
|
| 374 |
+
req_id = str(self._req_id)
|
| 375 |
+
return metadata + [("thread_id", thread_id), ("req_id", req_id)]
|
| 376 |
+
|
| 377 |
+
def _on_channel_state_change(self, conn_state: grpc.ChannelConnectivity):
|
| 378 |
+
logger.debug(f"client gRPC channel state change: {conn_state}")
|
| 379 |
+
self._conn_state = conn_state
|
| 380 |
+
|
| 381 |
+
def connection_info(self):
|
| 382 |
+
try:
|
| 383 |
+
data = self.data_client.ConnectionInfo()
|
| 384 |
+
except grpc.RpcError as e:
|
| 385 |
+
raise decode_exception(e)
|
| 386 |
+
return {
|
| 387 |
+
"num_clients": data.num_clients,
|
| 388 |
+
"python_version": data.python_version,
|
| 389 |
+
"ray_version": data.ray_version,
|
| 390 |
+
"ray_commit": data.ray_commit,
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
def register_callback(
|
| 394 |
+
self,
|
| 395 |
+
ref: ClientObjectRef,
|
| 396 |
+
callback: Callable[[ray_client_pb2.DataResponse], None],
|
| 397 |
+
) -> None:
|
| 398 |
+
req = ray_client_pb2.GetRequest(ids=[ref.id], asynchronous=True)
|
| 399 |
+
self.data_client.RegisterGetCallback(req, callback)
|
| 400 |
+
|
| 401 |
+
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
|
| 402 |
+
if isinstance(vals, list):
|
| 403 |
+
if not vals:
|
| 404 |
+
return []
|
| 405 |
+
to_get = vals
|
| 406 |
+
elif isinstance(vals, ClientObjectRef):
|
| 407 |
+
to_get = [vals]
|
| 408 |
+
else:
|
| 409 |
+
raise Exception(
|
| 410 |
+
"Can't get something that's not a "
|
| 411 |
+
"list of IDs or just an ID: %s" % type(vals)
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
if timeout is None:
|
| 415 |
+
deadline = None
|
| 416 |
+
else:
|
| 417 |
+
deadline = time.monotonic() + timeout
|
| 418 |
+
|
| 419 |
+
max_blocking_operation_time = MAX_BLOCKING_OPERATION_TIME_S
|
| 420 |
+
if "RAY_CLIENT_MAX_BLOCKING_OPERATION_TIME_S" in os.environ:
|
| 421 |
+
max_blocking_operation_time = float(
|
| 422 |
+
os.environ["RAY_CLIENT_MAX_BLOCKING_OPERATION_TIME_S"]
|
| 423 |
+
)
|
| 424 |
+
while True:
|
| 425 |
+
if deadline:
|
| 426 |
+
op_timeout = min(
|
| 427 |
+
max_blocking_operation_time,
|
| 428 |
+
max(deadline - time.monotonic(), 0.001),
|
| 429 |
+
)
|
| 430 |
+
else:
|
| 431 |
+
op_timeout = max_blocking_operation_time
|
| 432 |
+
try:
|
| 433 |
+
res = self._get(to_get, op_timeout)
|
| 434 |
+
break
|
| 435 |
+
except GetTimeoutError:
|
| 436 |
+
if deadline and time.monotonic() > deadline:
|
| 437 |
+
raise
|
| 438 |
+
logger.debug("Internal retry for get {}".format(to_get))
|
| 439 |
+
if len(to_get) != len(res):
|
| 440 |
+
raise Exception(
|
| 441 |
+
"Mismatched number of items in request ({}) and response ({})".format(
|
| 442 |
+
len(to_get), len(res)
|
| 443 |
+
)
|
| 444 |
+
)
|
| 445 |
+
if isinstance(vals, ClientObjectRef):
|
| 446 |
+
res = res[0]
|
| 447 |
+
return res
|
| 448 |
+
|
| 449 |
+
def _get(self, ref: List[ClientObjectRef], timeout: float):
|
| 450 |
+
req = ray_client_pb2.GetRequest(ids=[r.id for r in ref], timeout=timeout)
|
| 451 |
+
data = bytearray()
|
| 452 |
+
try:
|
| 453 |
+
resp = self._get_object_iterator(req, metadata=self.metadata)
|
| 454 |
+
for chunk in resp:
|
| 455 |
+
if not chunk.valid:
|
| 456 |
+
try:
|
| 457 |
+
err = cloudpickle.loads(chunk.error)
|
| 458 |
+
except (pickle.UnpicklingError, TypeError):
|
| 459 |
+
logger.exception("Failed to deserialize {}".format(chunk.error))
|
| 460 |
+
raise
|
| 461 |
+
raise err
|
| 462 |
+
if chunk.total_size > OBJECT_TRANSFER_WARNING_SIZE and log_once(
|
| 463 |
+
"client_object_transfer_size_warning"
|
| 464 |
+
):
|
| 465 |
+
size_gb = chunk.total_size / 2**30
|
| 466 |
+
warnings.warn(
|
| 467 |
+
"Ray Client is attempting to retrieve a "
|
| 468 |
+
f"{size_gb:.2f} GiB object over the network, which may "
|
| 469 |
+
"be slow. Consider serializing the object to a file "
|
| 470 |
+
"and using S3 or rsync instead.",
|
| 471 |
+
UserWarning,
|
| 472 |
+
stacklevel=5,
|
| 473 |
+
)
|
| 474 |
+
data.extend(chunk.data)
|
| 475 |
+
except grpc.RpcError as e:
|
| 476 |
+
raise decode_exception(e)
|
| 477 |
+
return loads_from_server(data)
|
| 478 |
+
|
| 479 |
+
def put(
|
| 480 |
+
self,
|
| 481 |
+
val,
|
| 482 |
+
*,
|
| 483 |
+
client_ref_id: bytes = None,
|
| 484 |
+
_owner: Optional[ClientActorHandle] = None,
|
| 485 |
+
):
|
| 486 |
+
if isinstance(val, ClientObjectRef):
|
| 487 |
+
raise TypeError(
|
| 488 |
+
"Calling 'put' on an ObjectRef is not allowed "
|
| 489 |
+
"(similarly, returning an ObjectRef from a remote "
|
| 490 |
+
"function is not allowed). If you really want to "
|
| 491 |
+
"do this, you can wrap the ObjectRef in a list and "
|
| 492 |
+
"call 'put' on it (or return it)."
|
| 493 |
+
)
|
| 494 |
+
data = dumps_from_client(val, self._client_id)
|
| 495 |
+
return self._put_pickled(data, client_ref_id, _owner)
|
| 496 |
+
|
| 497 |
+
def _put_pickled(
|
| 498 |
+
self, data, client_ref_id: bytes, owner: Optional[ClientActorHandle] = None
|
| 499 |
+
):
|
| 500 |
+
req = ray_client_pb2.PutRequest(data=data)
|
| 501 |
+
if client_ref_id is not None:
|
| 502 |
+
req.client_ref_id = client_ref_id
|
| 503 |
+
if owner is not None:
|
| 504 |
+
req.owner_id = owner.actor_ref.id
|
| 505 |
+
|
| 506 |
+
resp = self.data_client.PutObject(req)
|
| 507 |
+
if not resp.valid:
|
| 508 |
+
try:
|
| 509 |
+
raise cloudpickle.loads(resp.error)
|
| 510 |
+
except (pickle.UnpicklingError, TypeError):
|
| 511 |
+
logger.exception("Failed to deserialize {}".format(resp.error))
|
| 512 |
+
raise
|
| 513 |
+
return ClientObjectRef(resp.id)
|
| 514 |
+
|
| 515 |
+
# TODO(ekl) respect MAX_BLOCKING_OPERATION_TIME_S for wait too
|
| 516 |
+
def wait(
|
| 517 |
+
self,
|
| 518 |
+
object_refs: List[ClientObjectRef],
|
| 519 |
+
*,
|
| 520 |
+
num_returns: int = 1,
|
| 521 |
+
timeout: float = None,
|
| 522 |
+
fetch_local: bool = True,
|
| 523 |
+
) -> Tuple[List[ClientObjectRef], List[ClientObjectRef]]:
|
| 524 |
+
if not isinstance(object_refs, list):
|
| 525 |
+
raise TypeError(
|
| 526 |
+
"wait() expected a list of ClientObjectRef, " f"got {type(object_refs)}"
|
| 527 |
+
)
|
| 528 |
+
for ref in object_refs:
|
| 529 |
+
if not isinstance(ref, ClientObjectRef):
|
| 530 |
+
raise TypeError(
|
| 531 |
+
"wait() expected a list of ClientObjectRef, "
|
| 532 |
+
f"got list containing {type(ref)}"
|
| 533 |
+
)
|
| 534 |
+
data = {
|
| 535 |
+
"object_ids": [object_ref.id for object_ref in object_refs],
|
| 536 |
+
"num_returns": num_returns,
|
| 537 |
+
"timeout": timeout if (timeout is not None) else -1,
|
| 538 |
+
"client_id": self._client_id,
|
| 539 |
+
}
|
| 540 |
+
req = ray_client_pb2.WaitRequest(**data)
|
| 541 |
+
resp = self._call_stub("WaitObject", req, metadata=self.metadata)
|
| 542 |
+
if not resp.valid:
|
| 543 |
+
# TODO(ameer): improve error/exceptions messages.
|
| 544 |
+
raise Exception("Client Wait request failed. Reference invalid?")
|
| 545 |
+
client_ready_object_ids = [
|
| 546 |
+
ClientObjectRef(ref) for ref in resp.ready_object_ids
|
| 547 |
+
]
|
| 548 |
+
client_remaining_object_ids = [
|
| 549 |
+
ClientObjectRef(ref) for ref in resp.remaining_object_ids
|
| 550 |
+
]
|
| 551 |
+
|
| 552 |
+
return (client_ready_object_ids, client_remaining_object_ids)
|
| 553 |
+
|
| 554 |
+
def call_remote(self, instance, *args, **kwargs) -> List[Future]:
|
| 555 |
+
task = instance._prepare_client_task()
|
| 556 |
+
# data is serialized tuple of (args, kwargs)
|
| 557 |
+
task.data = dumps_from_client((args, kwargs), self._client_id)
|
| 558 |
+
num_returns = instance._num_returns()
|
| 559 |
+
if num_returns == "dynamic":
|
| 560 |
+
num_returns = -1
|
| 561 |
+
if num_returns == "streaming":
|
| 562 |
+
raise RuntimeError(
|
| 563 |
+
'Streaming actor methods (num_returns="streaming") '
|
| 564 |
+
"are not currently supported when using Ray Client."
|
| 565 |
+
)
|
| 566 |
+
|
| 567 |
+
return self._call_schedule_for_task(task, num_returns)
|
| 568 |
+
|
| 569 |
+
def _call_schedule_for_task(
|
| 570 |
+
self, task: ray_client_pb2.ClientTask, num_returns: Optional[int]
|
| 571 |
+
) -> List[Future]:
|
| 572 |
+
logger.debug(f"Scheduling task {task.name} {task.type} {task.payload_id}")
|
| 573 |
+
task.client_id = self._client_id
|
| 574 |
+
if num_returns is None:
|
| 575 |
+
num_returns = 1
|
| 576 |
+
|
| 577 |
+
num_return_refs = num_returns
|
| 578 |
+
if num_return_refs == -1:
|
| 579 |
+
num_return_refs = 1
|
| 580 |
+
id_futures = [Future() for _ in range(num_return_refs)]
|
| 581 |
+
|
| 582 |
+
def populate_ids(resp: Union[ray_client_pb2.DataResponse, Exception]) -> None:
|
| 583 |
+
if isinstance(resp, Exception):
|
| 584 |
+
if isinstance(resp, grpc.RpcError):
|
| 585 |
+
resp = decode_exception(resp)
|
| 586 |
+
for future in id_futures:
|
| 587 |
+
future.set_exception(resp)
|
| 588 |
+
return
|
| 589 |
+
|
| 590 |
+
ticket = resp.task_ticket
|
| 591 |
+
if not ticket.valid:
|
| 592 |
+
try:
|
| 593 |
+
ex = cloudpickle.loads(ticket.error)
|
| 594 |
+
except (pickle.UnpicklingError, TypeError) as e_new:
|
| 595 |
+
ex = e_new
|
| 596 |
+
for future in id_futures:
|
| 597 |
+
future.set_exception(ex)
|
| 598 |
+
return
|
| 599 |
+
|
| 600 |
+
if len(ticket.return_ids) != num_return_refs:
|
| 601 |
+
exc = ValueError(
|
| 602 |
+
f"Expected {num_return_refs} returns but received "
|
| 603 |
+
f"{len(ticket.return_ids)}"
|
| 604 |
+
)
|
| 605 |
+
for future, raw_id in zip(id_futures, ticket.return_ids):
|
| 606 |
+
future.set_exception(exc)
|
| 607 |
+
return
|
| 608 |
+
|
| 609 |
+
for future, raw_id in zip(id_futures, ticket.return_ids):
|
| 610 |
+
future.set_result(raw_id)
|
| 611 |
+
|
| 612 |
+
self.data_client.Schedule(task, populate_ids)
|
| 613 |
+
|
| 614 |
+
self.total_outbound_message_size_bytes += task.ByteSize()
|
| 615 |
+
if (
|
| 616 |
+
self.total_outbound_message_size_bytes > MESSAGE_SIZE_THRESHOLD
|
| 617 |
+
and log_once("client_communication_overhead_warning")
|
| 618 |
+
):
|
| 619 |
+
warnings.warn(
|
| 620 |
+
"More than 10MB of messages have been created to schedule "
|
| 621 |
+
"tasks on the server. This can be slow on Ray Client due to "
|
| 622 |
+
"communication overhead over the network. If you're running "
|
| 623 |
+
"many fine-grained tasks, consider running them inside a "
|
| 624 |
+
'single remote function. See the section on "Too '
|
| 625 |
+
'fine-grained tasks" in the Ray Design Patterns document for '
|
| 626 |
+
f"more details: {DESIGN_PATTERN_FINE_GRAIN_TASKS_LINK}. If "
|
| 627 |
+
"your functions frequently use large objects, consider "
|
| 628 |
+
"storing the objects remotely with ray.put. An example of "
|
| 629 |
+
'this is shown in the "Closure capture of large / '
|
| 630 |
+
'unserializable object" section of the Ray Design Patterns '
|
| 631 |
+
"document, available here: "
|
| 632 |
+
f"{DESIGN_PATTERN_LARGE_OBJECTS_LINK}",
|
| 633 |
+
UserWarning,
|
| 634 |
+
)
|
| 635 |
+
return id_futures
|
| 636 |
+
|
| 637 |
+
def call_release(self, id: bytes) -> None:
|
| 638 |
+
if self.closed:
|
| 639 |
+
return
|
| 640 |
+
self.reference_count[id] -= 1
|
| 641 |
+
if self.reference_count[id] == 0:
|
| 642 |
+
self._release_server(id)
|
| 643 |
+
del self.reference_count[id]
|
| 644 |
+
|
| 645 |
+
def _release_server(self, id: bytes) -> None:
|
| 646 |
+
if self.data_client is not None:
|
| 647 |
+
logger.debug(f"Releasing {id.hex()}")
|
| 648 |
+
self.data_client.ReleaseObject(ray_client_pb2.ReleaseRequest(ids=[id]))
|
| 649 |
+
|
| 650 |
+
def call_retain(self, id: bytes) -> None:
|
| 651 |
+
logger.debug(f"Retaining {id.hex()}")
|
| 652 |
+
self.reference_count[id] += 1
|
| 653 |
+
|
| 654 |
+
def close(self):
|
| 655 |
+
self._in_shutdown = True
|
| 656 |
+
self.closed = True
|
| 657 |
+
self.data_client.close()
|
| 658 |
+
self.log_client.close()
|
| 659 |
+
self.server = None
|
| 660 |
+
if self.channel:
|
| 661 |
+
self.channel.close()
|
| 662 |
+
self.channel = None
|
| 663 |
+
|
| 664 |
+
def get_actor(
|
| 665 |
+
self, name: str, namespace: Optional[str] = None
|
| 666 |
+
) -> ClientActorHandle:
|
| 667 |
+
task = ray_client_pb2.ClientTask()
|
| 668 |
+
task.type = ray_client_pb2.ClientTask.NAMED_ACTOR
|
| 669 |
+
task.name = name
|
| 670 |
+
task.namespace = namespace or ""
|
| 671 |
+
# Populate task.data with empty args and kwargs
|
| 672 |
+
task.data = dumps_from_client(([], {}), self._client_id)
|
| 673 |
+
futures = self._call_schedule_for_task(task, 1)
|
| 674 |
+
assert len(futures) == 1
|
| 675 |
+
handle = ClientActorHandle(ClientActorRef(futures[0], weak_ref=True))
|
| 676 |
+
# `actor_ref.is_nil()` waits until the underlying ID is resolved.
|
| 677 |
+
# This is needed because `get_actor` is often used to check the
|
| 678 |
+
# existence of an actor.
|
| 679 |
+
if handle.actor_ref.is_nil():
|
| 680 |
+
raise ValueError(f"ActorID for {name} is empty")
|
| 681 |
+
return handle
|
| 682 |
+
|
| 683 |
+
def terminate_actor(self, actor: ClientActorHandle, no_restart: bool) -> None:
|
| 684 |
+
if not isinstance(actor, ClientActorHandle):
|
| 685 |
+
raise ValueError(
|
| 686 |
+
"ray.kill() only supported for actors. Got: {}.".format(type(actor))
|
| 687 |
+
)
|
| 688 |
+
term_actor = ray_client_pb2.TerminateRequest.ActorTerminate()
|
| 689 |
+
term_actor.id = actor.actor_ref.id
|
| 690 |
+
term_actor.no_restart = no_restart
|
| 691 |
+
term = ray_client_pb2.TerminateRequest(actor=term_actor)
|
| 692 |
+
term.client_id = self._client_id
|
| 693 |
+
try:
|
| 694 |
+
self.data_client.Terminate(term)
|
| 695 |
+
except grpc.RpcError as e:
|
| 696 |
+
raise decode_exception(e)
|
| 697 |
+
|
| 698 |
+
def terminate_task(
|
| 699 |
+
self, obj: ClientObjectRef, force: bool, recursive: bool
|
| 700 |
+
) -> None:
|
| 701 |
+
if not isinstance(obj, ClientObjectRef):
|
| 702 |
+
raise TypeError(
|
| 703 |
+
"ray.cancel() only supported for non-actor object refs. "
|
| 704 |
+
f"Got: {type(obj)}."
|
| 705 |
+
)
|
| 706 |
+
term_object = ray_client_pb2.TerminateRequest.TaskObjectTerminate()
|
| 707 |
+
term_object.id = obj.id
|
| 708 |
+
term_object.force = force
|
| 709 |
+
term_object.recursive = recursive
|
| 710 |
+
term = ray_client_pb2.TerminateRequest(task_object=term_object)
|
| 711 |
+
term.client_id = self._client_id
|
| 712 |
+
try:
|
| 713 |
+
self.data_client.Terminate(term)
|
| 714 |
+
except grpc.RpcError as e:
|
| 715 |
+
raise decode_exception(e)
|
| 716 |
+
|
| 717 |
+
def get_cluster_info(
|
| 718 |
+
self,
|
| 719 |
+
req_type: ray_client_pb2.ClusterInfoType.TypeEnum,
|
| 720 |
+
timeout: Optional[float] = None,
|
| 721 |
+
):
|
| 722 |
+
req = ray_client_pb2.ClusterInfoRequest()
|
| 723 |
+
req.type = req_type
|
| 724 |
+
resp = self.server.ClusterInfo(req, timeout=timeout, metadata=self.metadata)
|
| 725 |
+
if resp.WhichOneof("response_type") == "resource_table":
|
| 726 |
+
# translate from a proto map to a python dict
|
| 727 |
+
output_dict = {k: v for k, v in resp.resource_table.table.items()}
|
| 728 |
+
return output_dict
|
| 729 |
+
elif resp.WhichOneof("response_type") == "runtime_context":
|
| 730 |
+
return resp.runtime_context
|
| 731 |
+
return json.loads(resp.json)
|
| 732 |
+
|
| 733 |
+
def internal_kv_get(self, key: bytes, namespace: Optional[bytes]) -> bytes:
|
| 734 |
+
req = ray_client_pb2.KVGetRequest(key=key, namespace=namespace)
|
| 735 |
+
try:
|
| 736 |
+
resp = self._call_stub("KVGet", req, metadata=self.metadata)
|
| 737 |
+
except grpc.RpcError as e:
|
| 738 |
+
raise decode_exception(e)
|
| 739 |
+
if resp.HasField("value"):
|
| 740 |
+
return resp.value
|
| 741 |
+
# Value is None when the key does not exist in the KV.
|
| 742 |
+
return None
|
| 743 |
+
|
| 744 |
+
def internal_kv_exists(self, key: bytes, namespace: Optional[bytes]) -> bool:
|
| 745 |
+
req = ray_client_pb2.KVExistsRequest(key=key, namespace=namespace)
|
| 746 |
+
try:
|
| 747 |
+
resp = self._call_stub("KVExists", req, metadata=self.metadata)
|
| 748 |
+
except grpc.RpcError as e:
|
| 749 |
+
raise decode_exception(e)
|
| 750 |
+
return resp.exists
|
| 751 |
+
|
| 752 |
+
def internal_kv_put(
|
| 753 |
+
self, key: bytes, value: bytes, overwrite: bool, namespace: Optional[bytes]
|
| 754 |
+
) -> bool:
|
| 755 |
+
req = ray_client_pb2.KVPutRequest(
|
| 756 |
+
key=key, value=value, overwrite=overwrite, namespace=namespace
|
| 757 |
+
)
|
| 758 |
+
metadata = self._add_ids_to_metadata(self.metadata)
|
| 759 |
+
try:
|
| 760 |
+
resp = self._call_stub("KVPut", req, metadata=metadata)
|
| 761 |
+
except grpc.RpcError as e:
|
| 762 |
+
raise decode_exception(e)
|
| 763 |
+
return resp.already_exists
|
| 764 |
+
|
| 765 |
+
def internal_kv_del(
|
| 766 |
+
self, key: bytes, del_by_prefix: bool, namespace: Optional[bytes]
|
| 767 |
+
) -> int:
|
| 768 |
+
req = ray_client_pb2.KVDelRequest(
|
| 769 |
+
key=key, del_by_prefix=del_by_prefix, namespace=namespace
|
| 770 |
+
)
|
| 771 |
+
metadata = self._add_ids_to_metadata(self.metadata)
|
| 772 |
+
try:
|
| 773 |
+
resp = self._call_stub("KVDel", req, metadata=metadata)
|
| 774 |
+
except grpc.RpcError as e:
|
| 775 |
+
raise decode_exception(e)
|
| 776 |
+
return resp.deleted_num
|
| 777 |
+
|
| 778 |
+
def internal_kv_list(
|
| 779 |
+
self, prefix: bytes, namespace: Optional[bytes]
|
| 780 |
+
) -> List[bytes]:
|
| 781 |
+
try:
|
| 782 |
+
req = ray_client_pb2.KVListRequest(prefix=prefix, namespace=namespace)
|
| 783 |
+
return self._call_stub("KVList", req, metadata=self.metadata).keys
|
| 784 |
+
except grpc.RpcError as e:
|
| 785 |
+
raise decode_exception(e)
|
| 786 |
+
|
| 787 |
+
def pin_runtime_env_uri(self, uri: str, expiration_s: int) -> None:
|
| 788 |
+
req = ray_client_pb2.ClientPinRuntimeEnvURIRequest(
|
| 789 |
+
uri=uri, expiration_s=expiration_s
|
| 790 |
+
)
|
| 791 |
+
self._call_stub("PinRuntimeEnvURI", req, metadata=self.metadata)
|
| 792 |
+
|
| 793 |
+
def list_named_actors(self, all_namespaces: bool) -> List[Dict[str, str]]:
|
| 794 |
+
req = ray_client_pb2.ClientListNamedActorsRequest(all_namespaces=all_namespaces)
|
| 795 |
+
return json.loads(self.data_client.ListNamedActors(req).actors_json)
|
| 796 |
+
|
| 797 |
+
def is_initialized(self) -> bool:
|
| 798 |
+
if not self.is_connected() or self.server is None:
|
| 799 |
+
return False
|
| 800 |
+
if not self._serverside_ray_initialized:
|
| 801 |
+
# We only check that Ray is initialized on the server once to
|
| 802 |
+
# avoid making an RPC every time this function is called. This is
|
| 803 |
+
# safe to do because Ray only 'un-initializes' on the server when
|
| 804 |
+
# the Client connection is torn down.
|
| 805 |
+
self._serverside_ray_initialized = self.get_cluster_info(
|
| 806 |
+
ray_client_pb2.ClusterInfoType.IS_INITIALIZED
|
| 807 |
+
)
|
| 808 |
+
|
| 809 |
+
return self._serverside_ray_initialized
|
| 810 |
+
|
| 811 |
+
def ping_server(self, timeout=None) -> bool:
|
| 812 |
+
"""Simple health check.
|
| 813 |
+
|
| 814 |
+
Piggybacks the IS_INITIALIZED call to check if the server provides
|
| 815 |
+
an actual response.
|
| 816 |
+
"""
|
| 817 |
+
if self.server is not None:
|
| 818 |
+
logger.debug("Pinging server.")
|
| 819 |
+
result = self.get_cluster_info(
|
| 820 |
+
ray_client_pb2.ClusterInfoType.PING, timeout=timeout
|
| 821 |
+
)
|
| 822 |
+
return result is not None
|
| 823 |
+
return False
|
| 824 |
+
|
| 825 |
+
def is_connected(self) -> bool:
|
| 826 |
+
return not self._in_shutdown and self._has_connected
|
| 827 |
+
|
| 828 |
+
def _server_init(
|
| 829 |
+
self, job_config: JobConfig, ray_init_kwargs: Optional[Dict[str, Any]] = None
|
| 830 |
+
):
|
| 831 |
+
"""Initialize the server"""
|
| 832 |
+
if ray_init_kwargs is None:
|
| 833 |
+
ray_init_kwargs = {}
|
| 834 |
+
try:
|
| 835 |
+
if job_config is None:
|
| 836 |
+
serialized_job_config = None
|
| 837 |
+
else:
|
| 838 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 839 |
+
runtime_env = job_config.runtime_env or {}
|
| 840 |
+
runtime_env = upload_py_modules_if_needed(
|
| 841 |
+
runtime_env, tmp_dir, logger=logger
|
| 842 |
+
)
|
| 843 |
+
runtime_env = upload_working_dir_if_needed(
|
| 844 |
+
runtime_env, tmp_dir, logger=logger
|
| 845 |
+
)
|
| 846 |
+
# Remove excludes, it isn't relevant after the upload step.
|
| 847 |
+
runtime_env.pop("excludes", None)
|
| 848 |
+
job_config.set_runtime_env(runtime_env, validate=True)
|
| 849 |
+
|
| 850 |
+
serialized_job_config = pickle.dumps(job_config)
|
| 851 |
+
|
| 852 |
+
response = self.data_client.Init(
|
| 853 |
+
ray_client_pb2.InitRequest(
|
| 854 |
+
job_config=serialized_job_config,
|
| 855 |
+
ray_init_kwargs=json.dumps(ray_init_kwargs),
|
| 856 |
+
reconnect_grace_period=self._reconnect_grace_period,
|
| 857 |
+
)
|
| 858 |
+
)
|
| 859 |
+
if not response.ok:
|
| 860 |
+
raise ConnectionAbortedError(
|
| 861 |
+
f"Initialization failure from server:\n{response.msg}"
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
except grpc.RpcError as e:
|
| 865 |
+
raise decode_exception(e)
|
| 866 |
+
|
| 867 |
+
def _convert_actor(self, actor: "ActorClass") -> str:
|
| 868 |
+
"""Register a ClientActorClass for the ActorClass and return a UUID"""
|
| 869 |
+
key = uuid.uuid4().hex
|
| 870 |
+
cls = actor.__ray_metadata__.modified_class
|
| 871 |
+
self._converted[key] = ClientActorClass(cls, options=actor._default_options)
|
| 872 |
+
return key
|
| 873 |
+
|
| 874 |
+
def _convert_function(self, func: "RemoteFunction") -> str:
|
| 875 |
+
"""Register a ClientRemoteFunc for the ActorClass and return a UUID"""
|
| 876 |
+
key = uuid.uuid4().hex
|
| 877 |
+
self._converted[key] = ClientRemoteFunc(
|
| 878 |
+
func._function, options=func._default_options
|
| 879 |
+
)
|
| 880 |
+
return key
|
| 881 |
+
|
| 882 |
+
def _get_converted(self, key: str) -> "ClientStub":
|
| 883 |
+
"""Given a UUID, return the converted object"""
|
| 884 |
+
return self._converted[key]
|
| 885 |
+
|
| 886 |
+
def _converted_key_exists(self, key: str) -> bool:
|
| 887 |
+
"""Check if a key UUID is present in the store of converted objects."""
|
| 888 |
+
return key in self._converted
|
| 889 |
+
|
| 890 |
+
def _dumps_from_client(self, val) -> bytes:
|
| 891 |
+
return dumps_from_client(val, self._client_id)
|
| 892 |
+
|
| 893 |
+
|
| 894 |
+
def make_client_id() -> str:
|
| 895 |
+
id = uuid.uuid4()
|
| 896 |
+
return id.hex
|
| 897 |
+
|
| 898 |
+
|
| 899 |
+
def decode_exception(e: grpc.RpcError) -> Exception:
|
| 900 |
+
if e.code() != grpc.StatusCode.ABORTED:
|
| 901 |
+
# The ABORTED status code is used by the server when an application
|
| 902 |
+
# error is serialized into the the exception details. If the code
|
| 903 |
+
# isn't ABORTED, then return the original error since there's no
|
| 904 |
+
# serialized error to decode.
|
| 905 |
+
# See server.py::return_exception_in_context for details
|
| 906 |
+
return ConnectionError(f"GRPC connection failed: {e}")
|
| 907 |
+
data = base64.standard_b64decode(e.details())
|
| 908 |
+
return loads_from_server(data)
|
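For context, a minimal sketch of how this Worker is typically reached in practice: user code connects through the public ray.init() Ray Client URI, which constructs a Worker like the one above and paces its gRPC connection attempts with the backoff() schedule shown earlier. The address below is a placeholder, assuming a reachable Ray Client server.

import ray

# Connecting with a ray:// address routes through ray.util.client.
ray.init("ray://127.0.0.1:10001")

@ray.remote
def f(x):
    return x + 1

print(ray.get(f.remote(1)))  # -> 2
ray.shutdown()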
.venv/lib/python3.11/site-packages/ray/util/dask/__init__.py
ADDED
@@ -0,0 +1,63 @@
import dask
from .scheduler import (
    ray_dask_get,
    ray_dask_get_sync,
    enable_dask_on_ray,
    disable_dask_on_ray,
)
from .callbacks import (
    RayDaskCallback,
    local_ray_callbacks,
    unpack_ray_callbacks,
    ProgressBarCallback,
)
from .optimizations import dataframe_optimize

dask_persist = dask.persist


def ray_dask_persist(*args, **kwargs):
    kwargs["ray_persist"] = True
    return dask_persist(*args, **kwargs)


ray_dask_persist.__doc__ = dask_persist.__doc__

dask_persist_mixin = dask.base.DaskMethodsMixin.persist


def ray_dask_persist_mixin(self, **kwargs):
    kwargs["ray_persist"] = True
    return dask_persist_mixin(self, **kwargs)


ray_dask_persist_mixin.__doc__ = dask_persist_mixin.__doc__


# We patch dask in order to inject a kwarg into its `dask.persist()` calls,
# which the Dask-on-Ray scheduler needs.
# FIXME(Clark): Monkey patching is bad and we should try to avoid this.
def patch_dask(ray_dask_persist, ray_dask_persist_mixin):
    dask.persist = ray_dask_persist
    dask.base.DaskMethodsMixin.persist = ray_dask_persist_mixin


patch_dask(ray_dask_persist, ray_dask_persist_mixin)

__all__ = [
    # Config
    "enable_dask_on_ray",
    "disable_dask_on_ray",
    # Schedulers
    "ray_dask_get",
    "ray_dask_get_sync",
    # Helpers
    "ray_dask_persist",
    # Callbacks
    "RayDaskCallback",
    "local_ray_callbacks",
    "unpack_ray_callbacks",
    # Optimizations
    "dataframe_optimize",
    "ProgressBarCallback",
]
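A minimal usage sketch for the Dask-on-Ray scheduler exported above (assumes dask, dask[array], and ray are installed and a local Ray instance can be started):

import ray
import dask.array as da
from ray.util.dask import ray_dask_get

ray.init()
# Build a small Dask graph and execute it on Ray by passing the
# Dask-on-Ray scheduler to compute().
x = da.ones((1000, 1000), chunks=(100, 100))
total = x.sum().compute(scheduler=ray_dask_get)
ray.shutdown()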
.venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.68 kB)

.venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/callbacks.cpython-311.pyc
ADDED
Binary file (17.9 kB)

.venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/common.cpython-311.pyc
ADDED
Binary file (4.98 kB)

.venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/optimizations.cpython-311.pyc
ADDED
Binary file (7.55 kB)

.venv/lib/python3.11/site-packages/ray/util/dask/__pycache__/scheduler.cpython-311.pyc
ADDED
Binary file (25.1 kB)