koichi12 committed on
Commit e8a93e7 · verified · 1 Parent(s): de7cd93

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .venv/lib/python3.11/site-packages/ray/_private/__init__.py +0 -0
  2. .venv/lib/python3.11/site-packages/ray/_private/arrow_serialization.py +816 -0
  3. .venv/lib/python3.11/site-packages/ray/_private/async_compat.py +52 -0
  4. .venv/lib/python3.11/site-packages/ray/_private/async_utils.py +52 -0
  5. .venv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py +31 -0
  6. .venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py +184 -0
  7. .venv/lib/python3.11/site-packages/ray/_private/collections_utils.py +10 -0
  8. .venv/lib/python3.11/site-packages/ray/_private/compat.py +40 -0
  9. .venv/lib/python3.11/site-packages/ray/_private/conftest_utils.py +14 -0
  10. .venv/lib/python3.11/site-packages/ray/_private/dict.py +247 -0
  11. .venv/lib/python3.11/site-packages/ray/_private/external_storage.py +707 -0
  12. .venv/lib/python3.11/site-packages/ray/_private/function_manager.py +706 -0
  13. .venv/lib/python3.11/site-packages/ray/_private/gcs_aio_client.py +47 -0
  14. .venv/lib/python3.11/site-packages/ray/_private/gcs_pubsub.py +311 -0
  15. .venv/lib/python3.11/site-packages/ray/_private/gcs_utils.py +163 -0
  16. .venv/lib/python3.11/site-packages/ray/_private/inspect_util.py +49 -0
  17. .venv/lib/python3.11/site-packages/ray/_private/internal_api.py +255 -0
  18. .venv/lib/python3.11/site-packages/ray/_private/log.py +117 -0
  19. .venv/lib/python3.11/site-packages/ray/_private/log_monitor.py +581 -0
  20. .venv/lib/python3.11/site-packages/ray/_private/logging_utils.py +29 -0
  21. .venv/lib/python3.11/site-packages/ray/_private/memory_monitor.py +162 -0
  22. .venv/lib/python3.11/site-packages/ray/_private/metrics_agent.py +675 -0
  23. .venv/lib/python3.11/site-packages/ray/_private/node.py +1862 -0
  24. .venv/lib/python3.11/site-packages/ray/_private/parameter.py +483 -0
  25. .venv/lib/python3.11/site-packages/ray/_private/process_watcher.py +198 -0
  26. .venv/lib/python3.11/site-packages/ray/_private/profiling.py +240 -0
  27. .venv/lib/python3.11/site-packages/ray/_private/prometheus_exporter.py +365 -0
  28. .venv/lib/python3.11/site-packages/ray/_private/protobuf_compat.py +46 -0
  29. .venv/lib/python3.11/site-packages/ray/_private/pydantic_compat.py +108 -0
  30. .venv/lib/python3.11/site-packages/ray/_private/ray_client_microbenchmark.py +117 -0
  31. .venv/lib/python3.11/site-packages/ray/_private/ray_cluster_perf.py +50 -0
  32. .venv/lib/python3.11/site-packages/ray/_private/ray_constants.py +554 -0
  33. .venv/lib/python3.11/site-packages/ray/_private/ray_experimental_perf.py +337 -0
  34. .venv/lib/python3.11/site-packages/ray/_private/ray_microbenchmark_helpers.py +91 -0
  35. .venv/lib/python3.11/site-packages/ray/_private/ray_option_utils.py +387 -0
  36. .venv/lib/python3.11/site-packages/ray/_private/ray_perf.py +328 -0
  37. .venv/lib/python3.11/site-packages/ray/_private/ray_process_reaper.py +60 -0
  38. .venv/lib/python3.11/site-packages/ray/_private/resource_spec.py +317 -0
  39. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/__init__.py +3 -0
  40. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/_clonevirtualenv.py +334 -0
  41. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/conda.py +407 -0
  42. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/conda_utils.py +278 -0
  43. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/constants.py +28 -0
  44. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/context.py +108 -0
  45. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/default_impl.py +11 -0
  46. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/dependency_utils.py +113 -0
  47. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/image_uri.py +195 -0
  48. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/java_jars.py +103 -0
  49. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/mpi.py +114 -0
  50. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/mpi_runner.py +32 -0
.venv/lib/python3.11/site-packages/ray/_private/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/_private/arrow_serialization.py ADDED
@@ -0,0 +1,816 @@
1
+ # arrow_serialization.py must reside outside of ray.data, otherwise
2
+ # it causes circular dependency issues for AsyncActors due to
3
+ # ray.data's lazy import.
4
+ # see https://github.com/ray-project/ray/issues/30498 for more context.
5
+ from dataclasses import dataclass
6
+ import logging
7
+ import os
8
+ import sys
9
+ from typing import List, Tuple, Optional, TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ import pyarrow
13
+ from ray.data.extensions import ArrowTensorArray
14
+
15
+ RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION = (
16
+ "RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION"
17
+ )
18
+ RAY_DISABLE_CUSTOM_ARROW_DATA_SERIALIZATION = (
19
+ "RAY_DISABLE_CUSTOM_ARROW_DATA_SERIALIZATION"
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Whether we have already warned the user about bloated fallback serialization.
25
+ _serialization_fallback_set = set()
26
+
27
+ # Whether we're currently running in a test, either local or CI.
28
+ _in_test = None
29
+
30
+
31
+ def _is_in_test():
32
+ global _in_test
33
+
34
+ if _in_test is None:
35
+ _in_test = any(
36
+ env_var in os.environ
37
+ # These environment variables are always set by pytest and Buildkite,
38
+ # respectively.
39
+ for env_var in ("PYTEST_CURRENT_TEST", "BUILDKITE")
40
+ )
41
+ return _in_test
42
+
43
+
44
+ def _register_custom_datasets_serializers(serialization_context):
45
+ try:
46
+ import pyarrow as pa # noqa: F401
47
+ except ModuleNotFoundError:
48
+ # No pyarrow installed so not using Arrow, so no need for custom serializers.
49
+ return
50
+
51
+ # Register all custom serializers required by Datasets.
52
+ _register_arrow_data_serializer(serialization_context)
53
+ _register_arrow_json_readoptions_serializer(serialization_context)
54
+ _register_arrow_json_parseoptions_serializer(serialization_context)
55
+
56
+
57
+ # Register custom Arrow JSON ReadOptions serializer to work around it not being picklable
58
+ # in Arrow < 8.0.0.
59
+ def _register_arrow_json_readoptions_serializer(serialization_context):
60
+ if (
61
+ os.environ.get(
62
+ RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION,
63
+ "0",
64
+ )
65
+ == "1"
66
+ ):
67
+ return
68
+
69
+ import pyarrow.json as pajson
70
+
71
+ serialization_context._register_cloudpickle_serializer(
72
+ pajson.ReadOptions,
73
+ custom_serializer=lambda opts: (opts.use_threads, opts.block_size),
74
+ custom_deserializer=lambda args: pajson.ReadOptions(*args),
75
+ )
76
+
77
+
78
+ def _register_arrow_json_parseoptions_serializer(serialization_context):
79
+ if (
80
+ os.environ.get(
81
+ RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION,
82
+ "0",
83
+ )
84
+ == "1"
85
+ ):
86
+ return
87
+
88
+ import pyarrow.json as pajson
89
+
90
+ serialization_context._register_cloudpickle_serializer(
91
+ pajson.ParseOptions,
92
+ custom_serializer=lambda opts: (
93
+ opts.explicit_schema,
94
+ opts.newlines_in_values,
95
+ opts.unexpected_field_behavior,
96
+ ),
97
+ custom_deserializer=lambda args: pajson.ParseOptions(*args),
98
+ )
99
+
100
+
101
+ # Register custom Arrow data serializer to work around zero-copy slice pickling bug.
102
+ # See https://issues.apache.org/jira/browse/ARROW-10739.
103
+ def _register_arrow_data_serializer(serialization_context):
104
+ """Custom reducer for Arrow data that works around a zero-copy slicing pickling
105
+ bug by using the Arrow IPC format for the underlying serialization.
106
+
107
+ Background:
108
+ Arrow has both array-level slicing and buffer-level slicing; both are zero-copy,
109
+ but the former has a serialization bug where the entire buffer is serialized
110
+ instead of just the slice, while the latter's serialization works as expected
111
+ and only serializes the slice of the buffer. I.e., array-level slicing doesn't
112
+ propagate the slice down to the buffer when serializing the array.
113
+
114
+ We work around this by registering a custom cloudpickle reducers for Arrow
115
+ Tables that delegates serialization to the Arrow IPC format; thankfully, Arrow's
116
+ IPC serialization has fixed this buffer truncation bug.
117
+
118
+ See https://issues.apache.org/jira/browse/ARROW-10739.
119
+ """
120
+ if os.environ.get(RAY_DISABLE_CUSTOM_ARROW_DATA_SERIALIZATION, "0") == "1":
121
+ return
122
+
123
+ import pyarrow as pa
124
+
125
+ serialization_context._register_cloudpickle_reducer(pa.Table, _arrow_table_reduce)
126
+
127
+
128
+ def _arrow_table_reduce(t: "pyarrow.Table"):
129
+ """Custom reducer for Arrow Tables that works around a zero-copy slice pickling bug.
130
+ Background:
131
+ Arrow has both array-level slicing and buffer-level slicing; both are zero-copy,
132
+ but the former has a serialization bug where the entire buffer is serialized
133
+ instead of just the slice, while the latter's serialization works as expected
134
+ and only serializes the slice of the buffer. I.e., array-level slicing doesn't
135
+ propagate the slice down to the buffer when serializing the array.
136
+ All that these copy methods do is, at serialization time, take the array-level
137
+ slicing and translate them to buffer-level slicing, so only the buffer slice is
138
+ sent over the wire instead of the entire buffer.
139
+ See https://issues.apache.org/jira/browse/ARROW-10739.
140
+ """
141
+ global _serialization_fallback_set
142
+
143
+ # Reduce the ChunkedArray columns.
144
+ reduced_columns = []
145
+ for column_name in t.column_names:
146
+ column = t[column_name]
147
+ try:
148
+ # Delegate to ChunkedArray reducer.
149
+ reduced_column = _arrow_chunked_array_reduce(column)
150
+ except Exception as e:
151
+ if not _is_dense_union(column.type) and _is_in_test():
152
+ # If running in a test and the column is not a dense union array
153
+ # (which we expect to need a fallback), we want to raise the error,
154
+ # not fall back.
155
+ raise e from None
156
+ if type(column.type) not in _serialization_fallback_set:
157
+ logger.warning(
158
+ "Failed to complete optimized serialization of Arrow Table, "
159
+ f"serialization of column '{column_name}' of type {column.type} "
160
+ "failed, so we're falling back to Arrow IPC serialization for the "
161
+ "table. Note that this may result in slower serialization and more "
162
+ "worker memory utilization. Serialization error:",
163
+ exc_info=True,
164
+ )
165
+ _serialization_fallback_set.add(type(column.type))
166
+ # Fall back to Arrow IPC-based workaround for the entire table.
167
+ return _arrow_table_ipc_reduce(t)
168
+ else:
169
+ # Column reducer succeeded, add reduced column to list.
170
+ reduced_columns.append(reduced_column)
171
+ return _reconstruct_table, (reduced_columns, t.schema)
172
+
173
+
174
+ def _reconstruct_table(
175
+ reduced_columns: List[Tuple[List["pyarrow.Array"], "pyarrow.DataType"]],
176
+ schema: "pyarrow.Schema",
177
+ ) -> "pyarrow.Table":
178
+ """Restore a serialized Arrow Table, reconstructing each reduced column."""
179
+ import pyarrow as pa
180
+
181
+ # Reconstruct each reduced column.
182
+ columns = []
183
+ for chunks_payload, type_ in reduced_columns:
184
+ columns.append(_reconstruct_chunked_array(chunks_payload, type_))
185
+
186
+ return pa.Table.from_arrays(columns, schema=schema)
187
+
188
+
189
+ def _arrow_chunked_array_reduce(
190
+ ca: "pyarrow.ChunkedArray",
191
+ ) -> Tuple[List["PicklableArrayPayload"], "pyarrow.DataType"]:
192
+ """Custom reducer for Arrow ChunkedArrays that works around a zero-copy slice
193
+ pickling bug. This reducer does not return a reconstruction function, since it's
194
+ expected to be reconstructed by the Arrow Table reconstructor.
195
+ """
196
+ # Convert chunks to serialization payloads.
197
+ chunk_payloads = []
198
+ for chunk in ca.chunks:
199
+ chunk_payload = PicklableArrayPayload.from_array(chunk)
200
+ chunk_payloads.append(chunk_payload)
201
+ return chunk_payloads, ca.type
202
+
203
+
204
+ def _reconstruct_chunked_array(
205
+ chunks: List["PicklableArrayPayload"], type_: "pyarrow.DataType"
206
+ ) -> "pyarrow.ChunkedArray":
207
+ """Restore a serialized Arrow ChunkedArray from chunks and type."""
208
+ import pyarrow as pa
209
+
210
+ # Reconstruct chunks from serialization payloads.
211
+ chunks = [chunk.to_array() for chunk in chunks]
212
+
213
+ return pa.chunked_array(chunks, type_)
214
+
215
+
216
+ @dataclass
217
+ class PicklableArrayPayload:
218
+ """Picklable array payload, holding data buffers and array metadata.
219
+
220
+ This is a helper container for pickling and reconstructing nested Arrow Arrays while
221
+ ensuring that the buffers that underlie zero-copy slice views are properly truncated.
222
+ """
223
+
224
+ # Array type.
225
+ type: "pyarrow.DataType"
226
+ # Length of array.
227
+ length: int
228
+ # Underlying data buffers.
229
+ buffers: List["pyarrow.Buffer"]
230
+ # Cached null count.
231
+ null_count: int
232
+ # Slice offset into base array.
233
+ offset: int
234
+ # Serialized array payloads for nested (child) arrays.
235
+ children: List["PicklableArrayPayload"]
236
+
237
+ @classmethod
238
+ def from_array(self, a: "pyarrow.Array") -> "PicklableArrayPayload":
239
+ """Create a picklable array payload from an Arrow Array.
240
+
241
+ This will recursively accumulate data buffer and metadata payloads that are
242
+ ready for pickling; namely, the data buffers underlying zero-copy slice views
243
+ will be properly truncated.
244
+ """
245
+ return _array_to_array_payload(a)
246
+
247
+ def to_array(self) -> "pyarrow.Array":
248
+ """Reconstruct an Arrow Array from this picklable payload."""
249
+ return _array_payload_to_array(self)
250
+
251
+
252
+ def _array_payload_to_array(payload: "PicklableArrayPayload") -> "pyarrow.Array":
253
+ """Reconstruct an Arrow Array from a possibly nested PicklableArrayPayload."""
254
+ import pyarrow as pa
255
+ from ray.air.util.tensor_extensions.arrow import get_arrow_extension_tensor_types
256
+
257
+ children = [child_payload.to_array() for child_payload in payload.children]
258
+
259
+ tensor_extension_types = get_arrow_extension_tensor_types()
260
+
261
+ if pa.types.is_dictionary(payload.type):
262
+ # Dedicated path for reconstructing a DictionaryArray, since
263
+ # Array.from_buffers() doesn't work for DictionaryArrays.
264
+ assert len(children) == 2, len(children)
265
+ indices, dictionary = children
266
+ return pa.DictionaryArray.from_arrays(indices, dictionary)
267
+ elif pa.types.is_map(payload.type) and len(children) > 1:
268
+ # In pyarrow<7.0.0, the underlying map child array is not exposed, so we work
269
+ # with the key and item arrays.
270
+ assert len(children) == 3, len(children)
271
+ offsets, keys, items = children
272
+ return pa.MapArray.from_arrays(offsets, keys, items)
273
+ elif isinstance(
274
+ payload.type,
275
+ tensor_extension_types,
276
+ ):
277
+ # Dedicated path for reconstructing an ArrowTensorArray or
278
+ # ArrowVariableShapedTensorArray, both of which can't be reconstructed by the
279
+ # Array.from_buffers() API.
280
+ assert len(children) == 1, len(children)
281
+ storage = children[0]
282
+ return pa.ExtensionArray.from_storage(payload.type, storage)
283
+ else:
284
+ # Common case: use Array.from_buffers() to construct an array of a certain type.
285
+ return pa.Array.from_buffers(
286
+ type=payload.type,
287
+ length=payload.length,
288
+ buffers=payload.buffers,
289
+ null_count=payload.null_count,
290
+ offset=payload.offset,
291
+ children=children,
292
+ )
293
+
294
+
295
+ def _array_to_array_payload(a: "pyarrow.Array") -> "PicklableArrayPayload":
296
+ """Serialize an Arrow Array to an PicklableArrayPayload for later pickling.
297
+
298
+ This function's primary purpose is to dispatch to the handler for the input array
299
+ type.
300
+ """
301
+ import pyarrow as pa
302
+
303
+ from ray.air.util.tensor_extensions.arrow import get_arrow_extension_tensor_types
304
+
305
+ tensor_extension_types = get_arrow_extension_tensor_types()
306
+
307
+ if _is_dense_union(a.type):
308
+ # Dense unions are not supported.
309
+ # TODO(Clark): Support dense unions.
310
+ raise NotImplementedError(
311
+ "Custom slice view serialization of dense union arrays is not yet "
312
+ "supported."
313
+ )
314
+
315
+ # Dispatch to handler for array type.
316
+ if pa.types.is_null(a.type):
317
+ return _null_array_to_array_payload(a)
318
+ elif _is_primitive(a.type):
319
+ return _primitive_array_to_array_payload(a)
320
+ elif _is_binary(a.type):
321
+ return _binary_array_to_array_payload(a)
322
+ elif pa.types.is_list(a.type) or pa.types.is_large_list(a.type):
323
+ return _list_array_to_array_payload(a)
324
+ elif pa.types.is_fixed_size_list(a.type):
325
+ return _fixed_size_list_array_to_array_payload(a)
326
+ elif pa.types.is_struct(a.type):
327
+ return _struct_array_to_array_payload(a)
328
+ elif pa.types.is_union(a.type):
329
+ return _union_array_to_array_payload(a)
330
+ elif pa.types.is_dictionary(a.type):
331
+ return _dictionary_array_to_array_payload(a)
332
+ elif pa.types.is_map(a.type):
333
+ return _map_array_to_array_payload(a)
334
+ elif isinstance(a.type, tensor_extension_types):
335
+ return _tensor_array_to_array_payload(a)
336
+ elif isinstance(a.type, pa.ExtensionType):
337
+ return _extension_array_to_array_payload(a)
338
+ else:
339
+ raise ValueError("Unhandled Arrow array type:", a.type)
340
+
341
+
342
+ def _is_primitive(type_: "pyarrow.DataType") -> bool:
343
+ """Whether the provided Array type is primitive (boolean, numeric, temporal or
344
+ fixed-size binary)."""
345
+ import pyarrow as pa
346
+
347
+ return (
348
+ pa.types.is_integer(type_)
349
+ or pa.types.is_floating(type_)
350
+ or pa.types.is_decimal(type_)
351
+ or pa.types.is_boolean(type_)
352
+ or pa.types.is_temporal(type_)
353
+ or pa.types.is_fixed_size_binary(type_)
354
+ )
355
+
356
+
357
+ def _is_binary(type_: "pyarrow.DataType") -> bool:
358
+ """Whether the provided Array type is a variable-sized binary type."""
359
+ import pyarrow as pa
360
+
361
+ return (
362
+ pa.types.is_string(type_)
363
+ or pa.types.is_large_string(type_)
364
+ or pa.types.is_binary(type_)
365
+ or pa.types.is_large_binary(type_)
366
+ )
367
+
368
+
369
+ def _null_array_to_array_payload(a: "pyarrow.NullArray") -> "PicklableArrayPayload":
370
+ """Serialize null array to PicklableArrayPayload."""
371
+ # Buffer scheme: [None]
372
+ return PicklableArrayPayload(
373
+ type=a.type,
374
+ length=len(a),
375
+ buffers=[None], # Single null buffer is expected.
376
+ null_count=a.null_count,
377
+ offset=0,
378
+ children=[],
379
+ )
380
+
381
+
382
+ def _primitive_array_to_array_payload(a: "pyarrow.Array") -> "PicklableArrayPayload":
383
+ """Serialize primitive (numeric, temporal, boolean) arrays to
384
+ PicklableArrayPayload.
385
+ """
386
+ assert _is_primitive(a.type), a.type
387
+ # Buffer scheme: [bitmap, data]
388
+ buffers = a.buffers()
389
+ assert len(buffers) == 2, len(buffers)
390
+
391
+ # Copy bitmap buffer, if needed.
392
+ bitmap_buf = buffers[0]
393
+ if a.null_count > 0:
394
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(bitmap_buf, a.offset, len(a))
395
+ else:
396
+ bitmap_buf = None
397
+
398
+ # Copy data buffer, if needed.
399
+ data_buf = buffers[1]
400
+ if data_buf is not None:
401
+ data_buf = _copy_buffer_if_needed(buffers[1], a.type, a.offset, len(a))
402
+
403
+ return PicklableArrayPayload(
404
+ type=a.type,
405
+ length=len(a),
406
+ buffers=[bitmap_buf, data_buf],
407
+ null_count=a.null_count,
408
+ offset=0,
409
+ children=[],
410
+ )
411
+
412
+
413
+ def _binary_array_to_array_payload(a: "pyarrow.Array") -> "PicklableArrayPayload":
414
+ """Serialize binary (variable-sized binary, string) arrays to
415
+ PicklableArrayPayload.
416
+ """
417
+ assert _is_binary(a.type), a.type
418
+ # Buffer scheme: [bitmap, value_offsets, data]
419
+ buffers = a.buffers()
420
+ assert len(buffers) == 3, len(buffers)
421
+
422
+ # Copy bitmap buffer, if needed.
423
+ if a.null_count > 0:
424
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(buffers[0], a.offset, len(a))
425
+ else:
426
+ bitmap_buf = None
427
+
428
+ # Copy offset buffer, if needed.
429
+ offset_buf = buffers[1]
430
+ offset_buf, data_offset, data_length = _copy_offsets_buffer_if_needed(
431
+ offset_buf, a.type, a.offset, len(a)
432
+ )
433
+ data_buf = buffers[2]
434
+ data_buf = _copy_buffer_if_needed(data_buf, None, data_offset, data_length)
435
+ return PicklableArrayPayload(
436
+ type=a.type,
437
+ length=len(a),
438
+ buffers=[bitmap_buf, offset_buf, data_buf],
439
+ null_count=a.null_count,
440
+ offset=0,
441
+ children=[],
442
+ )
443
+
444
+
445
+ def _list_array_to_array_payload(a: "pyarrow.Array") -> "PicklableArrayPayload":
446
+ """Serialize list (regular and large) arrays to PicklableArrayPayload."""
447
+ # Dedicated path for ListArrays. These arrays have a nested set of bitmap and
448
+ # offset buffers, eventually bottoming out on a data buffer.
449
+ # Buffer scheme:
450
+ # [bitmap, offsets, bitmap, offsets, ..., bitmap, data]
451
+ buffers = a.buffers()
452
+ assert len(buffers) > 1, len(buffers)
453
+
454
+ # Copy bitmap buffer, if needed.
455
+ if a.null_count > 0:
456
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(buffers[0], a.offset, len(a))
457
+ else:
458
+ bitmap_buf = None
459
+
460
+ # Copy offset buffer, if needed.
461
+ offset_buf = buffers[1]
462
+ offset_buf, child_offset, child_length = _copy_offsets_buffer_if_needed(
463
+ offset_buf, a.type, a.offset, len(a)
464
+ )
465
+
466
+ # Propagate slice to child.
467
+ child = a.values.slice(child_offset, child_length)
468
+
469
+ return PicklableArrayPayload(
470
+ type=a.type,
471
+ length=len(a),
472
+ buffers=[bitmap_buf, offset_buf],
473
+ null_count=a.null_count,
474
+ offset=0,
475
+ children=[_array_to_array_payload(child)],
476
+ )
477
+
478
+
479
+ def _fixed_size_list_array_to_array_payload(
480
+ a: "pyarrow.FixedSizeListArray",
481
+ ) -> "PicklableArrayPayload":
482
+ """Serialize fixed size list arrays to PicklableArrayPayload."""
483
+ # Dedicated path for fixed-size lists.
484
+ # Buffer scheme:
485
+ # [bitmap, values_bitmap, values_data, values_subbuffers...]
486
+ buffers = a.buffers()
487
+ assert len(buffers) >= 1, len(buffers)
488
+
489
+ # Copy bitmap buffer, if needed.
490
+ if a.null_count > 0:
491
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(buffers[0], a.offset, len(a))
492
+ else:
493
+ bitmap_buf = None
494
+
495
+ # Propagate slice to child.
496
+ child_offset = a.type.list_size * a.offset
497
+ child_length = a.type.list_size * len(a)
498
+ child = a.values.slice(child_offset, child_length)
499
+
500
+ return PicklableArrayPayload(
501
+ type=a.type,
502
+ length=len(a),
503
+ buffers=[bitmap_buf],
504
+ null_count=a.null_count,
505
+ offset=0,
506
+ children=[_array_to_array_payload(child)],
507
+ )
508
+
509
+
510
+ def _struct_array_to_array_payload(a: "pyarrow.StructArray") -> "PicklableArrayPayload":
511
+ """Serialize struct arrays to PicklableArrayPayload."""
512
+ # Dedicated path for StructArrays.
513
+ # StructArrays have a top-level bitmap buffer and one or more children arrays.
514
+ # Buffer scheme: [bitmap, None, child_bitmap, child_data, ...]
515
+ buffers = a.buffers()
516
+ assert len(buffers) >= 1, len(buffers)
517
+
518
+ # Copy bitmap buffer, if needed.
519
+ if a.null_count > 0:
520
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(buffers[0], a.offset, len(a))
521
+ else:
522
+ bitmap_buf = None
523
+
524
+ # Get field children payload.
525
+ # Offsets and truncations are already propagated to the field arrays, so we can
526
+ # serialize them as-is.
527
+ children = [_array_to_array_payload(a.field(i)) for i in range(a.type.num_fields)]
528
+ return PicklableArrayPayload(
529
+ type=a.type,
530
+ length=len(a),
531
+ buffers=[bitmap_buf],
532
+ null_count=a.null_count,
533
+ offset=0,
534
+ children=children,
535
+ )
536
+
537
+
538
+ def _union_array_to_array_payload(a: "pyarrow.UnionArray") -> "PicklableArrayPayload":
539
+ """Serialize union arrays to PicklableArrayPayload."""
540
+ import pyarrow as pa
541
+
542
+ # Dedicated path for UnionArrays.
543
+ # UnionArrays have a top-level bitmap buffer and type code buffer, and one or
544
+ # more children arrays.
545
+ # Buffer scheme: [None, typecodes, child_bitmap, child_data, ...]
546
+ assert not _is_dense_union(a.type)
547
+ buffers = a.buffers()
548
+ assert len(buffers) > 1, len(buffers)
549
+
550
+ bitmap_buf = buffers[0]
551
+ assert bitmap_buf is None, bitmap_buf
552
+
553
+ # Copy type code buffer, if needed.
554
+ type_code_buf = buffers[1]
555
+ type_code_buf = _copy_buffer_if_needed(type_code_buf, pa.int8(), a.offset, len(a))
556
+
557
+ # Get field children payload.
558
+ # Offsets and truncations are already propagated to the field arrays, so we can
559
+ # serialize them as-is.
560
+ children = [_array_to_array_payload(a.field(i)) for i in range(a.type.num_fields)]
561
+ return PicklableArrayPayload(
562
+ type=a.type,
563
+ length=len(a),
564
+ buffers=[bitmap_buf, type_code_buf],
565
+ null_count=a.null_count,
566
+ offset=0,
567
+ children=children,
568
+ )
569
+
570
+
571
+ def _dictionary_array_to_array_payload(
572
+ a: "pyarrow.DictionaryArray",
573
+ ) -> "PicklableArrayPayload":
574
+ """Serialize dictionary arrays to PicklableArrayPayload."""
575
+ # Dedicated path for DictionaryArrays.
576
+ # Buffer scheme: [indices_bitmap, indices_data] (dictionary stored separately)
577
+ indices_payload = _array_to_array_payload(a.indices)
578
+ dictionary_payload = _array_to_array_payload(a.dictionary)
579
+ return PicklableArrayPayload(
580
+ type=a.type,
581
+ length=len(a),
582
+ buffers=[],
583
+ null_count=a.null_count,
584
+ offset=0,
585
+ children=[indices_payload, dictionary_payload],
586
+ )
587
+
588
+
589
+ def _map_array_to_array_payload(a: "pyarrow.MapArray") -> "PicklableArrayPayload":
590
+ """Serialize map arrays to PicklableArrayPayload."""
591
+ import pyarrow as pa
592
+
593
+ # Dedicated path for MapArrays.
594
+ # Buffer scheme: [bitmap, offsets, child_struct_array_buffers, ...]
595
+ buffers = a.buffers()
596
+ assert len(buffers) > 0, len(buffers)
597
+
598
+ # Copy bitmap buffer, if needed.
599
+ if a.null_count > 0:
600
+ bitmap_buf = _copy_bitpacked_buffer_if_needed(buffers[0], a.offset, len(a))
601
+ else:
602
+ bitmap_buf = None
603
+
604
+ new_buffers = [bitmap_buf]
605
+
606
+ # Copy offsets buffer, if needed.
607
+ offset_buf = buffers[1]
608
+ offset_buf, data_offset, data_length = _copy_offsets_buffer_if_needed(
609
+ offset_buf, a.type, a.offset, len(a)
610
+ )
611
+
612
+ if isinstance(a, pa.lib.ListArray):
613
+ # Map arrays directly expose the one child struct array in pyarrow>=7.0.0, which
614
+ # is easier to work with than the raw buffers.
615
+ new_buffers.append(offset_buf)
616
+ children = [_array_to_array_payload(a.values.slice(data_offset, data_length))]
617
+ else:
618
+ # In pyarrow<7.0.0, the child struct array is not exposed, so we work with the
619
+ # key and item arrays.
620
+ buffers = a.buffers()
621
+ assert len(buffers) > 2, len(buffers)
622
+ # Reconstruct offsets array.
623
+ offsets = pa.Array.from_buffers(
624
+ pa.int32(), len(a) + 1, [bitmap_buf, offset_buf]
625
+ )
626
+ # Propagate slice to keys.
627
+ keys = a.keys.slice(data_offset, data_length)
628
+ # Propagate slice to items.
629
+ items = a.items.slice(data_offset, data_length)
630
+ children = [
631
+ _array_to_array_payload(offsets),
632
+ _array_to_array_payload(keys),
633
+ _array_to_array_payload(items),
634
+ ]
635
+ return PicklableArrayPayload(
636
+ type=a.type,
637
+ length=len(a),
638
+ buffers=new_buffers,
639
+ null_count=a.null_count,
640
+ offset=0,
641
+ children=children,
642
+ )
643
+
644
+
645
+ def _tensor_array_to_array_payload(a: "ArrowTensorArray") -> "PicklableArrayPayload":
646
+ """Serialize tensor arrays to PicklableArrayPayload."""
647
+ # Offset is propagated to storage array, and the storage array items align with the
648
+ # tensor elements, so we only need to do the straightforward creation of the storage
649
+ # array payload.
650
+ storage_payload = _array_to_array_payload(a.storage)
651
+ return PicklableArrayPayload(
652
+ type=a.type,
653
+ length=len(a),
654
+ buffers=[],
655
+ null_count=a.null_count,
656
+ offset=0,
657
+ children=[storage_payload],
658
+ )
659
+
660
+
661
+ def _extension_array_to_array_payload(
662
+ a: "pyarrow.ExtensionArray",
663
+ ) -> "PicklableArrayPayload":
664
+ payload = _array_to_array_payload(a.storage)
665
+ payload.type = a.type
666
+ payload.length = len(a)
667
+ payload.null_count = a.null_count
668
+ return payload
669
+
670
+
671
+ def _copy_buffer_if_needed(
672
+ buf: "pyarrow.Buffer",
673
+ type_: Optional["pyarrow.DataType"],
674
+ offset: int,
675
+ length: int,
676
+ ) -> "pyarrow.Buffer":
677
+ """Copy buffer, if needed."""
678
+ import pyarrow as pa
679
+
680
+ if type_ is not None and pa.types.is_boolean(type_):
681
+ # Arrow boolean array buffers are bit-packed, with 8 entries per byte,
682
+ # and are accessed via bit offsets.
683
+ buf = _copy_bitpacked_buffer_if_needed(buf, offset, length)
684
+ else:
685
+ type_bytewidth = type_.bit_width // 8 if type_ is not None else 1
686
+ buf = _copy_normal_buffer_if_needed(buf, type_bytewidth, offset, length)
687
+ return buf
688
+
689
+
690
+ def _copy_normal_buffer_if_needed(
691
+ buf: "pyarrow.Buffer",
692
+ byte_width: int,
693
+ offset: int,
694
+ length: int,
695
+ ) -> "pyarrow.Buffer":
696
+ """Copy buffer, if needed."""
697
+ byte_offset = offset * byte_width
698
+ byte_length = length * byte_width
699
+ if offset > 0 or byte_length < buf.size:
700
+ # Array is a zero-copy slice, so we need to copy to a new buffer before
701
+ # serializing; this slice of the underlying buffer (not the array) will ensure
702
+ # that the buffer is properly copied at pickle-time.
703
+ buf = buf.slice(byte_offset, byte_length)
704
+ return buf
705
+
706
+
707
+ def _copy_bitpacked_buffer_if_needed(
708
+ buf: "pyarrow.Buffer",
709
+ offset: int,
710
+ length: int,
711
+ ) -> "pyarrow.Buffer":
712
+ """Copy bit-packed binary buffer, if needed."""
713
+ bit_offset = offset % 8
714
+ byte_offset = offset // 8
715
+ byte_length = _bytes_for_bits(bit_offset + length) // 8
716
+ if offset > 0 or byte_length < buf.size:
717
+ buf = buf.slice(byte_offset, byte_length)
718
+ if bit_offset != 0:
719
+ # Need to manually shift the buffer to eliminate the bit offset.
720
+ buf = _align_bit_offset(buf, bit_offset, byte_length)
721
+ return buf
722
+
723
+
724
+ def _copy_offsets_buffer_if_needed(
725
+ buf: "pyarrow.Buffer",
726
+ arr_type: "pyarrow.DataType",
727
+ offset: int,
728
+ length: int,
729
+ ) -> Tuple["pyarrow.Buffer", int, int]:
730
+ """Copy the provided offsets buffer, returning the copied buffer and the
731
+ offset + length of the underlying data.
732
+ """
733
+ import pyarrow as pa
734
+ import pyarrow.compute as pac
735
+
736
+ if (
737
+ pa.types.is_large_list(arr_type)
738
+ or pa.types.is_large_string(arr_type)
739
+ or pa.types.is_large_binary(arr_type)
740
+ or pa.types.is_large_unicode(arr_type)
741
+ ):
742
+ offset_type = pa.int64()
743
+ else:
744
+ offset_type = pa.int32()
745
+ # Copy offset buffer, if needed.
746
+ buf = _copy_buffer_if_needed(buf, offset_type, offset, length + 1)
747
+ # Reconstruct the offset array so we can determine the offset and length
748
+ # of the child array.
749
+ offsets = pa.Array.from_buffers(offset_type, length + 1, [None, buf])
750
+ child_offset = offsets[0].as_py()
751
+ child_length = offsets[-1].as_py() - child_offset
752
+ # Create new offsets aligned to 0 for the copied data buffer slice.
753
+ offsets = pac.subtract(offsets, child_offset)
754
+ if pa.types.is_int32(offset_type):
755
+ # We need to cast the resulting Int64Array back down to an Int32Array.
756
+ offsets = offsets.cast(offset_type, safe=False)
757
+ buf = offsets.buffers()[1]
758
+ return buf, child_offset, child_length
759
+
760
+
761
+ def _bytes_for_bits(n: int) -> int:
762
+ """Round up n to the nearest multiple of 8.
763
+ This is used to get the byte-padded number of bits for n bits.
764
+ """
765
+ return (n + 7) & (-8)
766
+
767
+
768
+ def _align_bit_offset(
769
+ buf: "pyarrow.Buffer",
770
+ bit_offset: int,
771
+ byte_length: int,
772
+ ) -> "pyarrow.Buffer":
773
+ """Align the bit offset into the buffer with the front of the buffer by shifting
774
+ the buffer and eliminating the offset.
775
+ """
776
+ import pyarrow as pa
777
+
778
+ bytes_ = buf.to_pybytes()
779
+ bytes_as_int = int.from_bytes(bytes_, sys.byteorder)
780
+ bytes_as_int >>= bit_offset
781
+ bytes_ = bytes_as_int.to_bytes(byte_length, sys.byteorder)
782
+ return pa.py_buffer(bytes_)
783
+
784
+
785
+ def _arrow_table_ipc_reduce(table: "pyarrow.Table"):
786
+ """Custom reducer for Arrow Table that works around a zero-copy slicing pickling
787
+ bug by using the Arrow IPC format for the underlying serialization.
788
+
789
+ This is currently used as a fallback for unsupported types (or unknown bugs) for
790
+ the manual buffer truncation workaround, e.g. for dense unions.
791
+ """
792
+ from pyarrow.ipc import RecordBatchStreamWriter
793
+ from pyarrow.lib import BufferOutputStream
794
+
795
+ output_stream = BufferOutputStream()
796
+ with RecordBatchStreamWriter(output_stream, schema=table.schema) as wr:
797
+ wr.write_table(table)
798
+ # NOTE: output_stream.getvalue() materializes the serialized table to a single
799
+ # contiguous bytestring, resulting in a few copies. This adds 1-2 extra copies on the
800
+ # serialization side, and 1 extra copy on the deserialization side.
801
+ return _restore_table_from_ipc, (output_stream.getvalue(),)
802
+
803
+
804
+ def _restore_table_from_ipc(buf: bytes) -> "pyarrow.Table":
805
+ """Restore an Arrow Table serialized to Arrow IPC format."""
806
+ from pyarrow.ipc import RecordBatchStreamReader
807
+
808
+ with RecordBatchStreamReader(buf) as reader:
809
+ return reader.read_all()
810
+
811
+
812
+ def _is_dense_union(type_: "pyarrow.DataType") -> bool:
813
+ """Whether the provided Arrow type is a dense union."""
814
+ import pyarrow as pa
815
+
816
+ return pa.types.is_union(type_) and type_.mode == "dense"
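
A minimal sketch of the slice-pickling problem the module above works around, assuming only that pyarrow and pickle are available; the observed sizes depend on the pyarrow version, and the table and column names are made up for illustration:

import pickle

import pyarrow as pa

# A 1M-row table and a 10-row zero-copy (array-level) slice of it.
big = pa.table({"x": list(range(1_000_000))})
tiny_slice = big.slice(0, 10)

# Depending on the pyarrow version, pickling the slice directly may drag the
# entire backing buffer along; the custom reducers above avoid this by
# truncating buffers at serialization time (or by falling back to Arrow IPC).
print("pickled slice:", len(pickle.dumps(tiny_slice)), "bytes")
print("pickled materialized copy:",
      len(pickle.dumps(pa.table({"x": tiny_slice["x"].to_pylist()}))), "bytes")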
.venv/lib/python3.11/site-packages/ray/_private/async_compat.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ This file should only be imported from Python 3.
3
+ It will raise SyntaxError when importing from Python 2.
4
+ """
5
+ import asyncio
6
+ import inspect
7
+ from functools import lru_cache
8
+
9
+ try:
10
+ import uvloop
11
+ except ImportError:
12
+ uvloop = None
13
+
14
+
15
+ def get_new_event_loop():
16
+ """Construct a new event loop. Ray will use uvloop if it exists"""
17
+ if uvloop:
18
+ return uvloop.new_event_loop()
19
+ else:
20
+ return asyncio.new_event_loop()
21
+
22
+
23
+ def try_install_uvloop():
24
+ """Installs uvloop as event-loop implementation for asyncio (if available)"""
25
+ if uvloop:
26
+ uvloop.install()
27
+ else:
28
+ pass
29
+
30
+
31
+ def is_async_func(func) -> bool:
32
+ """Return True if the function is an async or async generator method."""
33
+ return inspect.iscoroutinefunction(func) or inspect.isasyncgenfunction(func)
34
+
35
+
36
+ @lru_cache(maxsize=2**10)
37
+ def has_async_methods(cls: object) -> bool:
38
+ """Return True if the class has any async methods."""
39
+ return len(inspect.getmembers(cls, predicate=is_async_func)) > 0
40
+
41
+
42
+ @lru_cache(maxsize=2**10)
43
+ def sync_to_async(func):
44
+ """Wrap a blocking function in an async function"""
45
+
46
+ if is_async_func(func):
47
+ return func
48
+
49
+ async def wrapper(*args, **kwargs):
50
+ return func(*args, **kwargs)
51
+
52
+ return wrapper
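
A short usage sketch for the helpers above (hypothetical caller code, not from the commit): wrap a blocking function with sync_to_async and run it on a loop obtained from get_new_event_loop.

from ray._private.async_compat import get_new_event_loop, sync_to_async


def blocking_add(a, b):
    return a + b


# sync_to_async is lru_cache'd, so wrapping the same function repeatedly is cheap.
async_add = sync_to_async(blocking_add)

loop = get_new_event_loop()  # uvloop loop if uvloop is installed, else asyncio
try:
    print(loop.run_until_complete(async_add(1, 2)))  # 3
finally:
    loop.close()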
.venv/lib/python3.11/site-packages/ray/_private/async_utils.py ADDED
@@ -0,0 +1,52 @@
1
+ # Adapted from [aiodebug](https://gitlab.com/quantlane/libs/aiodebug)
2
+
3
+ # Copyright 2016-2022 Quantlane s.r.o.
4
+
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # Modifications:
18
+ # - Removed the dependency to `logwood`.
19
+ # - Renamed `monitor_loop_lag.enable()` to just `enable_monitor_loop_lag()`.
20
+ # - Miscellaneous changes to make it work with Ray.
21
+
22
+ from typing import Callable, Optional
23
+ import asyncio
24
+ import asyncio.events
25
+
26
+
27
+ def enable_monitor_loop_lag(
28
+ callback: Callable[[float], None],
29
+ interval_s: float = 0.25,
30
+ loop: Optional[asyncio.AbstractEventLoop] = None,
31
+ ) -> None:
32
+ """
33
+ Start logging event loop lags to the callback. In ideal circumstances they should be
34
+ very close to zero. Lags may increase if event loop callbacks block for too long.
35
+
36
+ Note: this works for all event loops, including uvloop.
37
+
38
+ :param callback: Callback to call with the lag in seconds.
39
+ """
40
+ if loop is None:
41
+ loop = asyncio.get_running_loop()
42
+ if loop is None:
43
+ raise ValueError("No provided loop, nor running loop found.")
44
+
45
+ async def monitor():
46
+ while loop.is_running():
47
+ t0 = loop.time()
48
+ await asyncio.sleep(interval_s)
49
+ lag = loop.time() - t0 - interval_s # Should be close to zero.
50
+ callback(lag)
51
+
52
+ loop.create_task(monitor(), name="async_utils.monitor_loop_lag")
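
A hypothetical usage sketch for enable_monitor_loop_lag (not from the commit): block the running loop briefly so the lag callback reports a visibly non-zero value.

import asyncio
import time

from ray._private.async_utils import enable_monitor_loop_lag


async def main():
    # Report measured event-loop lag; ideally this stays near zero.
    enable_monitor_loop_lag(lambda lag: print(f"loop lag: {lag:.3f}s"))
    await asyncio.sleep(0)   # let the monitor task start
    time.sleep(0.5)          # deliberately block the loop to create lag
    await asyncio.sleep(1.0)


asyncio.run(main())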
.venv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py ADDED
@@ -0,0 +1,31 @@
1
+ import ray
2
+ import os
3
+ from functools import wraps
4
+ import threading
5
+
6
+ auto_init_lock = threading.Lock()
7
+ enable_auto_connect = os.environ.get("RAY_ENABLE_AUTO_CONNECT", "") != "0"
8
+
9
+
10
+ def auto_init_ray():
11
+ if enable_auto_connect and not ray.is_initialized():
12
+ with auto_init_lock:
13
+ if not ray.is_initialized():
14
+ ray.init()
15
+
16
+
17
+ def wrap_auto_init(fn):
18
+ @wraps(fn)
19
+ def auto_init_wrapper(*args, **kwargs):
20
+ auto_init_ray()
21
+ return fn(*args, **kwargs)
22
+
23
+ return auto_init_wrapper
24
+
25
+
26
+ def wrap_auto_init_for_all_apis(api_names):
27
+ """Wrap public APIs with automatic ray.init."""
28
+ for api_name in api_names:
29
+ api = getattr(ray, api_name, None)
30
+ assert api is not None, api_name
31
+ setattr(ray, api_name, wrap_auto_init(api))
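
A hypothetical sketch of how the hook above is meant to be used: any API wrapped with wrap_auto_init triggers ray.init() on first call, unless RAY_ENABLE_AUTO_CONNECT=0 is set.

import ray

from ray._private.auto_init_hook import wrap_auto_init


@wrap_auto_init
def put_value(x):
    # ray.init() has already run by the time the body executes.
    return ray.put(x)


ref = put_value(42)    # auto-connects to (or starts) a Ray instance here
print(ray.get(ref))    # 42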
.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py ADDED
@@ -0,0 +1,184 @@
1
+ import os
2
+ import threading
3
+ from contextlib import contextmanager
4
+ from functools import wraps
5
+ from ray._private.auto_init_hook import auto_init_ray
6
+
7
+ # Attr set on func defs to mark they have been converted to client mode.
8
+ RAY_CLIENT_MODE_ATTR = "__ray_client_mode_key__"
9
+
10
+ # Global setting of whether client mode is enabled. This defaults to OFF,
11
+ # but is enabled upon ray.client(...).connect() or in tests.
12
+ is_client_mode_enabled = os.environ.get("RAY_CLIENT_MODE", "0") == "1"
13
+
14
+ # When RAY_CLIENT_MODE == 1, we treat it as default enabled client mode
15
+ # This is useful for testing
16
+ is_client_mode_enabled_by_default = is_client_mode_enabled
17
+ os.environ.update({"RAY_CLIENT_MODE": "0"})
18
+
19
+ is_init_called = False
20
+
21
+ # Local setting of whether to ignore client hook conversion. This defaults
22
+ # to TRUE and is disabled when the underlying 'real' Ray function is needed.
23
+ _client_hook_status_on_thread = threading.local()
24
+ _client_hook_status_on_thread.status = True
25
+
26
+
27
+ def _get_client_hook_status_on_thread():
28
+ """Get's the value of `_client_hook_status_on_thread`.
29
+ Since `_client_hook_status_on_thread` is a thread-local variable, we may
30
+ need to add and set the 'status' attribute.
31
+ """
32
+ global _client_hook_status_on_thread
33
+ if not hasattr(_client_hook_status_on_thread, "status"):
34
+ _client_hook_status_on_thread.status = True
35
+ return _client_hook_status_on_thread.status
36
+
37
+
38
+ def _set_client_hook_status(val: bool):
39
+ global _client_hook_status_on_thread
40
+ _client_hook_status_on_thread.status = val
41
+
42
+
43
+ def _disable_client_hook():
44
+ global _client_hook_status_on_thread
45
+ out = _get_client_hook_status_on_thread()
46
+ _client_hook_status_on_thread.status = False
47
+ return out
48
+
49
+
50
+ def _explicitly_enable_client_mode():
51
+ """Force client mode to be enabled.
52
+ NOTE: This should not be used in tests, use `enable_client_mode`.
53
+ """
54
+ global is_client_mode_enabled
55
+ is_client_mode_enabled = True
56
+
57
+
58
+ def _explicitly_disable_client_mode():
59
+ global is_client_mode_enabled
60
+ is_client_mode_enabled = False
61
+
62
+
63
+ @contextmanager
64
+ def disable_client_hook():
65
+ val = _disable_client_hook()
66
+ try:
67
+ yield None
68
+ finally:
69
+ _set_client_hook_status(val)
70
+
71
+
72
+ @contextmanager
73
+ def enable_client_mode():
74
+ _explicitly_enable_client_mode()
75
+ try:
76
+ yield None
77
+ finally:
78
+ _explicitly_disable_client_mode()
79
+
80
+
81
+ def client_mode_hook(func: callable):
82
+ """Decorator for whether to use the 'regular' ray version of a function,
83
+ or the Ray Client version of that function.
84
+
85
+ Args:
86
+ func: This function. This is set when this function is used
87
+ as a decorator.
88
+ """
89
+
90
+ from ray.util.client import ray
91
+
92
+ @wraps(func)
93
+ def wrapper(*args, **kwargs):
94
+ # NOTE(hchen): DO NOT use "import" inside this function.
95
+ # Because when it's called within a `__del__` method, this error
96
+ # will be raised (see #35114):
97
+ # ImportError: sys.meta_path is None, Python is likely shutting down.
98
+ if client_mode_should_convert():
99
+ # Legacy code
100
+ # we only convert init function if RAY_CLIENT_MODE=1
101
+ if func.__name__ != "init" or is_client_mode_enabled_by_default:
102
+ return getattr(ray, func.__name__)(*args, **kwargs)
103
+ return func(*args, **kwargs)
104
+
105
+ return wrapper
106
+
107
+
108
+ def client_mode_should_convert():
109
+ """Determines if functions should be converted to client mode."""
110
+
111
+ # `is_client_mode_enabled_by_default` is used for testing with
112
+ # `RAY_CLIENT_MODE=1`. This flag means all tests run with client mode.
113
+ return (
114
+ is_client_mode_enabled or is_client_mode_enabled_by_default
115
+ ) and _get_client_hook_status_on_thread()
116
+
117
+
118
+ def client_mode_wrap(func):
119
+ """Wraps a function called during client mode for execution as a remote
120
+ task.
121
+
122
+ Can be used to implement public features of ray client which do not
123
+ belong in the main ray API (`ray.*`), yet require server-side execution.
124
+ An example is the creation of placement groups:
125
+ `ray.util.placement_group.placement_group()`. When called on the client
126
+ side, this function is wrapped in a task to facilitate interaction with
127
+ the GCS.
128
+ """
129
+
130
+ @wraps(func)
131
+ def wrapper(*args, **kwargs):
132
+ from ray.util.client import ray
133
+
134
+ auto_init_ray()
135
+ # Directly pass this through since `client_mode_wrap` is for
136
+ # Placement Group APIs
137
+ if client_mode_should_convert():
138
+ f = ray.remote(num_cpus=0)(func)
139
+ ref = f.remote(*args, **kwargs)
140
+ return ray.get(ref)
141
+ return func(*args, **kwargs)
142
+
143
+ return wrapper
144
+
145
+
146
+ def client_mode_convert_function(func_cls, in_args, in_kwargs, **kwargs):
147
+ """Runs a preregistered ray RemoteFunction through the ray client.
148
+
149
+ The common case for this is to transparently convert that RemoteFunction
150
+ to a ClientRemoteFunction. This happens in circumstances where the
151
+ RemoteFunction is declared early, in a library and only then is Ray used in
152
+ client mode -- necessitating a conversion.
153
+ """
154
+ from ray.util.client import ray
155
+
156
+ key = getattr(func_cls, RAY_CLIENT_MODE_ATTR, None)
157
+
158
+ # Second part of "or" is needed in case func_cls is reused between Ray
159
+ # client sessions in one Python interpreter session.
160
+ if (key is None) or (not ray._converted_key_exists(key)):
161
+ key = ray._convert_function(func_cls)
162
+ setattr(func_cls, RAY_CLIENT_MODE_ATTR, key)
163
+ client_func = ray._get_converted(key)
164
+ return client_func._remote(in_args, in_kwargs, **kwargs)
165
+
166
+
167
+ def client_mode_convert_actor(actor_cls, in_args, in_kwargs, **kwargs):
168
+ """Runs a preregistered actor class on the ray client
169
+
170
+ The common case for this decorator is for instantiating an ActorClass
171
+ transparently as a ClientActorClass. This happens in circumstances where
172
+ the ActorClass is declared early, in a library and only then is Ray used in
173
+ client mode -- necessitating a conversion.
174
+ """
175
+ from ray.util.client import ray
176
+
177
+ key = getattr(actor_cls, RAY_CLIENT_MODE_ATTR, None)
178
+ # Second part of "or" is needed in case actor_cls is reused between Ray
179
+ # client sessions in one Python interpreter session.
180
+ if (key is None) or (not ray._converted_key_exists(key)):
181
+ key = ray._convert_actor(actor_cls)
182
+ setattr(actor_cls, RAY_CLIENT_MODE_ATTR, key)
183
+ client_actor = ray._get_converted(key)
184
+ return client_actor._remote(in_args, in_kwargs, **kwargs)
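
A small hypothetical sketch for the hook machinery above: disable_client_hook temporarily forces decorated APIs to fall through to the local ("real") Ray implementation, which is reflected by client_mode_should_convert().

from ray._private.client_mode_hook import (
    client_mode_should_convert,
    disable_client_hook,
    enable_client_mode,
)

with enable_client_mode():
    assert client_mode_should_convert()          # calls would route through ray.util.client
    with disable_client_hook():
        assert not client_mode_should_convert()  # back to the local implementation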
.venv/lib/python3.11/site-packages/ray/_private/collections_utils.py ADDED
@@ -0,0 +1,10 @@
1
+ from typing import List, Any
2
+
3
+
4
+ def split(items: List[Any], chunk_size: int):
5
+ """Splits provided list into chunks of given size"""
6
+
7
+ assert chunk_size > 0, "Chunk size has to be > 0"
8
+
9
+ for i in range(0, len(items), chunk_size):
10
+ yield items[i : i + chunk_size]
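
For illustration (hypothetical caller code), split yields successive fixed-size chunks, with the last chunk holding the remainder:

from ray._private.collections_utils import split

chunks = list(split(list(range(10)), chunk_size=4))
print(chunks)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]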
.venv/lib/python3.11/site-packages/ray/_private/compat.py ADDED
@@ -0,0 +1,40 @@
1
+ import io
2
+ import platform
3
+
4
+
5
+ def patch_psutil():
6
+ """WSL's /proc/meminfo has an inconsistency where it
7
+ nondeterministically omits a space after colons (after "SwapFree:"
8
+ in my case).
9
+ psutil then splits on spaces and then parses the wrong field,
10
+ crashing on the 'int(fields[1])' expression in
11
+ psutil._pslinux.virtual_memory().
12
+ Workaround: We ensure there is a space following each colon.
13
+ """
14
+ assert (
15
+ platform.system() == "Linux"
16
+ and "Microsoft".lower() in platform.release().lower()
17
+ )
18
+
19
+ try:
20
+ import psutil._pslinux
21
+ except ImportError:
22
+ psutil = None
23
+ psutil_open_binary = None
24
+ if psutil:
25
+ try:
26
+ psutil_open_binary = psutil._pslinux.open_binary
27
+ except AttributeError:
28
+ pass
29
+ # Only patch it if it doesn't seem to have been patched already
30
+ if psutil_open_binary and psutil_open_binary.__name__ == "open_binary":
31
+
32
+ def psutil_open_binary_patched(fname, *args, **kwargs):
33
+ f = psutil_open_binary(fname, *args, **kwargs)
34
+ if fname == "/proc/meminfo":
35
+ with f:
36
+ # Make sure there's a space after colons
37
+ return io.BytesIO(f.read().replace(b":", b": "))
38
+ return f
39
+
40
+ psutil._pslinux.open_binary = psutil_open_binary_patched
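
The essence of the patch above is a byte-level rewrite of /proc/meminfo contents; a standalone sketch of the same transformation, with made-up meminfo text:

# WSL can emit "SwapFree:8388604 kB" with no space after the colon, which
# breaks psutil's whitespace-based field parsing.
raw = b"MemTotal:       16318464 kB\nSwapFree:8388604 kB\n"
fixed = raw.replace(b":", b": ")   # the same replacement patch_psutil applies
print(fixed.decode())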
.venv/lib/python3.11/site-packages/ray/_private/conftest_utils.py ADDED
@@ -0,0 +1,14 @@
1
+ import pytest
2
+ import ray._private.ray_constants as ray_constants
3
+
4
+
5
+ @pytest.fixture
6
+ def set_override_dashboard_url(monkeypatch, request):
7
+ override_url = getattr(request, "param", "https://external_dashboard_url")
8
+ with monkeypatch.context() as m:
9
+ if override_url:
10
+ m.setenv(
11
+ ray_constants.RAY_OVERRIDE_DASHBOARD_URL,
12
+ override_url,
13
+ )
14
+ yield
.venv/lib/python3.11/site-packages/ray/_private/dict.py ADDED
@@ -0,0 +1,247 @@
1
+ import copy
2
+ from collections import deque
3
+ from collections.abc import Mapping, Sequence
4
+ from typing import Dict, List, Optional, TypeVar, Union
5
+
6
+ from ray.util.annotations import Deprecated
7
+
8
+ T = TypeVar("T")
9
+
10
+
11
+ @Deprecated
12
+ def merge_dicts(d1: dict, d2: dict) -> dict:
13
+ """
14
+ Args:
15
+ d1 (dict): Dict 1.
16
+ d2 (dict): Dict 2.
17
+
18
+ Returns:
19
+ dict: A new dict that is d1 and d2 deep merged.
20
+ """
21
+ merged = copy.deepcopy(d1)
22
+ deep_update(merged, d2, True, [])
23
+ return merged
24
+
25
+
26
+ @Deprecated
27
+ def deep_update(
28
+ original: dict,
29
+ new_dict: dict,
30
+ new_keys_allowed: bool = False,
31
+ allow_new_subkey_list: Optional[List[str]] = None,
32
+ override_all_if_type_changes: Optional[List[str]] = None,
33
+ override_all_key_list: Optional[List[str]] = None,
34
+ ) -> dict:
35
+ """Updates original dict with values from new_dict recursively.
36
+
37
+ If new key is introduced in new_dict, then if new_keys_allowed is not
38
+ True, an error will be thrown. Further, for sub-dicts, if the key is
39
+ in the allow_new_subkey_list, then new subkeys can be introduced.
40
+
41
+ Args:
42
+ original: Dictionary with default values.
43
+ new_dict: Dictionary with values to be updated
44
+ new_keys_allowed: Whether new keys are allowed.
45
+ allow_new_subkey_list: List of keys that
46
+ correspond to dict values where new subkeys can be introduced.
47
+ This is only at the top level.
48
+ override_all_if_type_changes: List of top level
49
+ keys with value=dict, for which we always simply override the
50
+ entire value (dict), iff the "type" key in that value dict changes.
51
+ override_all_key_list: List of top level keys
52
+ for which we override the entire value if the key is in the new_dict.
53
+ """
54
+ allow_new_subkey_list = allow_new_subkey_list or []
55
+ override_all_if_type_changes = override_all_if_type_changes or []
56
+ override_all_key_list = override_all_key_list or []
57
+
58
+ for k, value in new_dict.items():
59
+ if k not in original and not new_keys_allowed:
60
+ raise Exception("Unknown config parameter `{}` ".format(k))
61
+
62
+ # Both original value and new one are dicts.
63
+ if (
64
+ isinstance(original.get(k), dict)
65
+ and isinstance(value, dict)
66
+ and k not in override_all_key_list
67
+ ):
68
+ # Check old type vs new one. If different, override entire value.
69
+ if (
70
+ k in override_all_if_type_changes
71
+ and "type" in value
72
+ and "type" in original[k]
73
+ and value["type"] != original[k]["type"]
74
+ ):
75
+ original[k] = value
76
+ # Allowed key -> ok to add new subkeys.
77
+ elif k in allow_new_subkey_list:
78
+ deep_update(
79
+ original[k],
80
+ value,
81
+ True,
82
+ override_all_key_list=override_all_key_list,
83
+ )
84
+ # Non-allowed key.
85
+ else:
86
+ deep_update(
87
+ original[k],
88
+ value,
89
+ new_keys_allowed,
90
+ override_all_key_list=override_all_key_list,
91
+ )
92
+ # Original value not a dict OR new value not a dict:
93
+ # Override entire value.
94
+ else:
95
+ original[k] = value
96
+ return original
97
+
98
+
99
+ @Deprecated
100
+ def flatten_dict(
101
+ dt: Dict,
102
+ delimiter: str = "/",
103
+ prevent_delimiter: bool = False,
104
+ flatten_list: bool = False,
105
+ ):
106
+ """Flatten dict.
107
+
108
+ Output and input are of the same dict type.
109
+ Input dict remains the same after the operation.
110
+ """
111
+
112
+ def _raise_delimiter_exception():
113
+ raise ValueError(
114
+ f"Found delimiter `{delimiter}` in key when trying to flatten "
115
+ f"array. Please avoid using the delimiter in your specification."
116
+ )
117
+
118
+ dt = copy.copy(dt)
119
+ if prevent_delimiter and any(delimiter in key for key in dt):
120
+ # Raise if delimiter is any of the keys
121
+ _raise_delimiter_exception()
122
+
123
+ while_check = (dict, list) if flatten_list else dict
124
+
125
+ while any(isinstance(v, while_check) for v in dt.values()):
126
+ remove = []
127
+ add = {}
128
+ for key, value in dt.items():
129
+ if isinstance(value, dict):
130
+ for subkey, v in value.items():
131
+ if prevent_delimiter and delimiter in subkey:
132
+ # Raise if delimiter is in any of the subkeys
133
+ _raise_delimiter_exception()
134
+
135
+ add[delimiter.join([key, str(subkey)])] = v
136
+ remove.append(key)
137
+ elif flatten_list and isinstance(value, list):
138
+ for i, v in enumerate(value):
139
+ if prevent_delimiter and delimiter in subkey:
140
+ # Raise if delimiter is in any of the subkeys
141
+ _raise_delimiter_exception()
142
+
143
+ add[delimiter.join([key, str(i)])] = v
144
+ remove.append(key)
145
+
146
+ dt.update(add)
147
+ for k in remove:
148
+ del dt[k]
149
+ return dt
150
+
151
+
152
+ @Deprecated
153
+ def unflatten_dict(dt: Dict[str, T], delimiter: str = "/") -> Dict[str, T]:
154
+ """Unflatten dict. Does not support unflattening lists."""
155
+ dict_type = type(dt)
156
+ out = dict_type()
157
+ for key, val in dt.items():
158
+ path = key.split(delimiter)
159
+ item = out
160
+ for k in path[:-1]:
161
+ item = item.setdefault(k, dict_type())
162
+ if not isinstance(item, dict_type):
163
+ raise TypeError(
164
+ f"Cannot unflatten dict due the key '{key}' "
165
+ f"having a parent key '{k}', which value is not "
166
+ f"of type {dict_type} (got {type(item)}). "
167
+ "Change the key names to resolve the conflict."
168
+ )
169
+ item[path[-1]] = val
170
+ return out
171
+
172
+
173
+ @Deprecated
174
+ def unflatten_list_dict(dt: Dict[str, T], delimiter: str = "/") -> Dict[str, T]:
175
+ """Unflatten nested dict and list.
176
+
177
+ This function has some limitations:
+ (1) The keys of dt must be str.
+ (2) If the unflattened dt (the result) contains a list, the indices must
+ appear in ascending order when iterating over dt. Otherwise, this
+ function raises an AssertionError.
+ (3) The unflattened dt (the result) shouldn't contain a dict with number
+ keys.
+
+ Use this function with care. If you want to improve this function,
+ please also improve the unit test. See #14487 for more details.
187
+
188
+ Args:
189
+ dt: Flattened dictionary that is originally nested by multiple
190
+ list and dict.
191
+ delimiter: Delimiter of keys.
192
+
193
+ Example:
194
+ >>> dt = {"aaa/0/bb": 12, "aaa/1/cc": 56, "aaa/1/dd": 92}
195
+ >>> unflatten_list_dict(dt)
196
+ {'aaa': [{'bb': 12}, {'cc': 56, 'dd': 92}]}
197
+ """
198
+ out_type = list if list(dt)[0].split(delimiter, 1)[0].isdigit() else type(dt)
199
+ out = out_type()
200
+ for key, val in dt.items():
201
+ path = key.split(delimiter)
202
+
203
+ item = out
204
+ for i, k in enumerate(path[:-1]):
205
+ next_type = list if path[i + 1].isdigit() else dict
206
+ if isinstance(item, dict):
207
+ item = item.setdefault(k, next_type())
208
+ elif isinstance(item, list):
209
+ if int(k) >= len(item):
210
+ item.append(next_type())
211
+ assert int(k) == len(item) - 1
212
+ item = item[int(k)]
213
+
214
+ if isinstance(item, dict):
215
+ item[path[-1]] = val
216
+ elif isinstance(item, list):
217
+ item.append(val)
218
+ assert int(path[-1]) == len(item) - 1
219
+ return out
220
+
221
+
222
+ @Deprecated
223
+ def unflattened_lookup(
224
+ flat_key: str, lookup: Union[Mapping, Sequence], delimiter: str = "/", **kwargs
225
+ ) -> Union[Mapping, Sequence]:
226
+ """
227
+ Unflatten `flat_key` and iteratively look up in `lookup`. E.g.
228
+ `flat_key="a/0/b"` will try to return `lookup["a"][0]["b"]`.
229
+ """
230
+ if flat_key in lookup:
231
+ return lookup[flat_key]
232
+ keys = deque(flat_key.split(delimiter))
233
+ base = lookup
234
+ while keys:
235
+ key = keys.popleft()
236
+ try:
237
+ if isinstance(base, Mapping):
238
+ base = base[key]
239
+ elif isinstance(base, Sequence):
240
+ base = base[int(key)]
241
+ else:
242
+ raise KeyError()
243
+ except KeyError as e:
244
+ if "default" in kwargs:
245
+ return kwargs["default"]
246
+ raise e
247
+ return base
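A small sketch of the lookup helper above; the config dict is hypothetical:

```python
cfg = {"workers": [{"resources": {"CPU": 4}}]}

unflattened_lookup("workers/0/resources/CPU", cfg)             # -> 4
unflattened_lookup("workers/0/resources/GPU", cfg, default=0)  # -> 0 (missing key)
```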
.venv/lib/python3.11/site-packages/ray/_private/external_storage.py ADDED
@@ -0,0 +1,707 @@
1
+ import abc
2
+ import logging
3
+ import os
4
+ import random
5
+ import shutil
6
+ import time
7
+ import urllib
8
+ import uuid
9
+ from collections import namedtuple
10
+ from typing import IO, List, Optional, Tuple, Union
11
+
12
+ import ray
13
+ from ray._private.ray_constants import DEFAULT_OBJECT_PREFIX
14
+ from ray._raylet import ObjectRef
15
+
16
+ ParsedURL = namedtuple("ParsedURL", "base_url, offset, size")
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def create_url_with_offset(*, url: str, offset: int, size: int) -> str:
21
+ """Methods to create a URL with offset.
22
+
23
+ When ray spills objects, it fuses multiple objects
24
+ into one file to optimize the performance. That says, each object
25
+ needs to keep tracking of its own special url to store metadata.
26
+
27
+ This method creates an url_with_offset, which is used internally
28
+ by Ray.
29
+
30
+ Created url_with_offset can be passed to the self._get_base_url method
31
+ to parse the filename used to store files.
32
+
33
+ Example) file://path/to/file?offset=""&size=""
34
+
35
+ Args:
36
+ url: url to the object stored in the external storage.
37
+ offset: Offset from the beginning of the file to
38
+ the first bytes of this object.
39
+ size: Size of the object that is stored in the url.
40
+ It is used to calculate the last offset.
41
+
42
+ Returns:
43
+ url_with_offset stored internally to find
44
+ objects from external storage.
45
+ """
46
+ return f"{url}?offset={offset}&size={size}"
47
+
48
+
49
+ def parse_url_with_offset(url_with_offset: str) -> Tuple[str, int, int]:
50
+ """Parse url_with_offset to retrieve information.
51
+
52
+ base_url is the url where the object ref
53
+ is stored in the external storage.
54
+
55
+ Args:
56
+ url_with_offset: url created by create_url_with_offset.
57
+
58
+ Returns:
59
+ named tuple of base_url, offset, and size.
60
+ """
61
+ parsed_result = urllib.parse.urlparse(url_with_offset)
62
+ query_dict = urllib.parse.parse_qs(parsed_result.query)
63
+ # Split by ? to remove the query from the url.
64
+ base_url = parsed_result.geturl().split("?")[0]
65
+ if "offset" not in query_dict or "size" not in query_dict:
66
+ raise ValueError(f"Failed to parse URL: {url_with_offset}")
67
+ offset = int(query_dict["offset"][0])
68
+ size = int(query_dict["size"][0])
69
+ return ParsedURL(base_url=base_url, offset=offset, size=size)
70
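A quick sketch of the two helpers above; the file path is hypothetical:

```python
url = create_url_with_offset(url="file:///tmp/ray/spill/abc123", offset=0, size=1024)
# 'file:///tmp/ray/spill/abc123?offset=0&size=1024'

parsed = parse_url_with_offset(url)
assert parsed.base_url == "file:///tmp/ray/spill/abc123"
assert (parsed.offset, parsed.size) == (0, 1024)
```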
+
71
+
72
+ class ExternalStorage(metaclass=abc.ABCMeta):
73
+ """The base class for external storage.
74
+
75
+ This class provides some useful functions for zero-copy object
76
+ put/get from plasma store. Also it specifies the interface for
77
+ object spilling.
78
+
79
+ When inheriting this class, please make sure to implement validation
+ logic inside the __init__ method. When a Ray instance starts, it
+ instantiates the external storage to validate the config.
82
+
83
+ Raises:
84
+ ValueError: when given configuration for
85
+ the external storage is invalid.
86
+ """
87
+
88
+ HEADER_LENGTH = 24
89
+
90
+ def _get_objects_from_store(self, object_refs):
91
+ worker = ray._private.worker.global_worker
92
+ # Since the object should always exist in the plasma store before
93
+ # spilling, it can directly get the object from the local plasma
94
+ # store.
95
+ # issue: https://github.com/ray-project/ray/pull/13831
96
+ ray_object_pairs = worker.core_worker.get_if_local(object_refs)
97
+ return ray_object_pairs
98
+
99
+ def _put_object_to_store(
100
+ self, metadata, data_size, file_like, object_ref, owner_address
101
+ ):
102
+ worker = ray._private.worker.global_worker
103
+ worker.core_worker.put_file_like_object(
104
+ metadata, data_size, file_like, object_ref, owner_address
105
+ )
106
+
107
+ def _write_multiple_objects(
108
+ self, f: IO, object_refs: List[ObjectRef], owner_addresses: List[str], url: str
109
+ ) -> List[str]:
110
+ """Fuse all given objects into a given file handle.
111
+
112
+ Args:
113
+ f: File handle to fuse all given object refs into.
+ object_refs: Object references to fuse into a single file.
115
+ owner_addresses: Owner addresses for the provided objects.
116
+ url: url where the object ref is stored
117
+ in the external storage.
118
+
119
+ Returns:
+ List of urls_with_offset of the fused objects.
+ The order of the returned keys matches the order of the
+ given object_refs.
123
+ """
124
+ keys = []
125
+ offset = 0
126
+ ray_object_pairs = self._get_objects_from_store(object_refs)
127
+ for ref, (buf, metadata), owner_address in zip(
128
+ object_refs, ray_object_pairs, owner_addresses
129
+ ):
130
+ address_len = len(owner_address)
131
+ metadata_len = len(metadata)
132
+ if buf is None and len(metadata) == 0:
133
+ error = f"Object {ref.hex()} does not exist."
134
+ raise ValueError(error)
135
+ buf_len = 0 if buf is None else len(buf)
136
+ payload = (
137
+ address_len.to_bytes(8, byteorder="little")
138
+ + metadata_len.to_bytes(8, byteorder="little")
139
+ + buf_len.to_bytes(8, byteorder="little")
140
+ + owner_address
141
+ + metadata
142
+ + (memoryview(buf) if buf_len else b"")
143
+ )
144
+ # 24 bytes to store owner address, metadata, and buffer lengths.
145
+ payload_len = len(payload)
146
+ assert (
147
+ self.HEADER_LENGTH + address_len + metadata_len + buf_len == payload_len
148
+ )
149
+ written_bytes = f.write(payload)
150
+ assert written_bytes == payload_len
151
+ url_with_offset = create_url_with_offset(
152
+ url=url, offset=offset, size=written_bytes
153
+ )
154
+ keys.append(url_with_offset.encode())
155
+ offset += written_bytes
156
+ # Necessary because pyarrow.io.NativeFile does not flush() on close().
157
+ f.flush()
158
+ return keys
159
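The fused layout written above is three little-endian 8-byte length fields followed by the owner address, metadata, and object payload. A self-contained sketch with made-up byte strings, using only the standard library:

```python
import io

owner_address, metadata, payload = b"owner", b"meta", b"0123456789"

buf = io.BytesIO()
buf.write(len(owner_address).to_bytes(8, "little"))
buf.write(len(metadata).to_bytes(8, "little"))
buf.write(len(payload).to_bytes(8, "little"))
buf.write(owner_address + metadata + payload)

# Reading mirrors the restore_spilled_objects implementations further below.
buf.seek(0)
address_len = int.from_bytes(buf.read(8), "little")
metadata_len = int.from_bytes(buf.read(8), "little")
buf_len = int.from_bytes(buf.read(8), "little")
assert buf.read(address_len) == owner_address
assert buf.read(metadata_len) == metadata
assert buf.read(buf_len) == payload
```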
+
160
+ def _size_check(self, address_len, metadata_len, buffer_len, obtained_data_size):
161
+ """Check whether or not the obtained_data_size is as expected.
162
+
163
+ Args:
+ address_len: Actual owner address length of the object.
+ metadata_len: Actual metadata length of the object.
+ buffer_len: Actual buffer length of the object.
+ obtained_data_size: Data size specified in the
+ url_with_offset.
+
+ Raises:
+ ValueError if obtained_data_size is different from
+ address_len + metadata_len + buffer_len +
+ HEADER_LENGTH (24 bytes for the three 8-byte length fields).
173
+ """
174
+ data_size_in_bytes = (
175
+ address_len + metadata_len + buffer_len + self.HEADER_LENGTH
176
+ )
177
+ if data_size_in_bytes != obtained_data_size:
178
+ raise ValueError(
179
+ f"Obtained data has a size of {data_size_in_bytes}, "
180
+ "although it is supposed to have the "
181
+ f"size of {obtained_data_size}."
182
+ )
183
+
184
+ @abc.abstractmethod
185
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
186
+ """Spill objects to the external storage. Objects are specified
187
+ by their object refs.
188
+
189
+ Args:
190
+ object_refs: The list of the refs of the objects to be spilled.
191
+ owner_addresses: Owner addresses for the provided objects.
192
+ Returns:
193
+ A list of internal URLs with object offset.
194
+ """
195
+
196
+ @abc.abstractmethod
197
+ def restore_spilled_objects(
198
+ self, object_refs: List[ObjectRef], url_with_offset_list: List[str]
199
+ ) -> int:
200
+ """Restore objects from the external storage.
201
+
202
+ Args:
203
+ object_refs: List of object IDs (note that it is not ref).
204
+ url_with_offset_list: List of url_with_offset.
205
+
206
+ Returns:
207
+ The total number of bytes restored.
208
+ """
209
+
210
+ @abc.abstractmethod
211
+ def delete_spilled_objects(self, urls: List[str]):
212
+ """Delete objects that are spilled to the external storage.
213
+
214
+ Args:
215
+ urls: URLs that store spilled object files.
216
+
217
+ NOTE: This function should not fail if some of the urls
218
+ do not exist.
219
+ """
220
+
221
+ @abc.abstractmethod
222
+ def destroy_external_storage(self):
223
+ """Destroy external storage when a head node is down.
224
+
225
+ NOTE: This currently only works when the cluster is
+ started by ray.init.
227
+ """
228
+
229
+
230
+ class NullStorage(ExternalStorage):
231
+ """The class that represents an uninitialized external storage."""
232
+
233
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
234
+ raise NotImplementedError("External storage is not initialized")
235
+
236
+ def restore_spilled_objects(self, object_refs, url_with_offset_list):
237
+ raise NotImplementedError("External storage is not initialized")
238
+
239
+ def delete_spilled_objects(self, urls: List[str]):
240
+ raise NotImplementedError("External storage is not initialized")
241
+
242
+ def destroy_external_storage(self):
243
+ raise NotImplementedError("External storage is not initialized")
244
+
245
+
246
+ class FileSystemStorage(ExternalStorage):
247
+ """The class for filesystem-like external storage.
248
+
249
+ Raises:
250
+ ValueError: If the directory path to
+ spill objects doesn't exist or cannot be created.
252
+ """
253
+
254
+ def __init__(
255
+ self,
256
+ node_id: str,
257
+ directory_path: Union[str, List[str]],
258
+ buffer_size: Optional[int] = None,
259
+ ):
260
+ # -- A list of directory paths to spill objects --
261
+ self._directory_paths = []
262
+ # -- Current directory to spill objects --
263
+ self._current_directory_index = 0
264
+ # -- File buffer size to spill objects --
265
+ self._buffer_size = -1
266
+
267
+ # Validation.
268
+ assert (
269
+ directory_path is not None
270
+ ), "directory_path should be provided to use object spilling."
271
+ if isinstance(directory_path, str):
272
+ directory_path = [directory_path]
273
+ assert isinstance(
274
+ directory_path, list
275
+ ), "Directory_path must be either a single string or a list of strings"
276
+ if buffer_size is not None:
277
+ assert isinstance(buffer_size, int), "buffer_size must be an integer."
278
+ self._buffer_size = buffer_size
279
+
280
+ # Create directories.
281
+ for path in directory_path:
282
+ full_dir_path = os.path.join(path, f"{DEFAULT_OBJECT_PREFIX}_{node_id}")
283
+ os.makedirs(full_dir_path, exist_ok=True)
284
+ if not os.path.exists(full_dir_path):
285
+ raise ValueError(
286
+ "The given directory path to store objects, "
287
+ f"{full_dir_path}, could not be created."
288
+ )
289
+ self._directory_paths.append(full_dir_path)
290
+ assert len(self._directory_paths) == len(directory_path)
291
+ # Choose the current directory.
+ # A random starting index is chosen to spread load across multiple
+ # directories that may be mounted at different mount points.
294
+ self._current_directory_index = random.randrange(0, len(self._directory_paths))
295
+
296
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
297
+ if len(object_refs) == 0:
298
+ return []
299
+ # Choose the current directory path by round robin order.
300
+ self._current_directory_index = (self._current_directory_index + 1) % len(
301
+ self._directory_paths
302
+ )
303
+ directory_path = self._directory_paths[self._current_directory_index]
304
+
305
+ filename = _get_unique_spill_filename(object_refs)
306
+ url = f"{os.path.join(directory_path, filename)}"
307
+ with open(url, "wb", buffering=self._buffer_size) as f:
308
+ return self._write_multiple_objects(f, object_refs, owner_addresses, url)
309
+
310
+ def restore_spilled_objects(
311
+ self, object_refs: List[ObjectRef], url_with_offset_list: List[str]
312
+ ):
313
+ total = 0
314
+ for i in range(len(object_refs)):
315
+ object_ref = object_refs[i]
316
+ url_with_offset = url_with_offset_list[i].decode()
317
+ # Retrieve the information needed.
318
+ parsed_result = parse_url_with_offset(url_with_offset)
319
+ base_url = parsed_result.base_url
320
+ offset = parsed_result.offset
321
+ # Read a part of the file and recover the object.
322
+ with open(base_url, "rb") as f:
323
+ f.seek(offset)
324
+ address_len = int.from_bytes(f.read(8), byteorder="little")
325
+ metadata_len = int.from_bytes(f.read(8), byteorder="little")
326
+ buf_len = int.from_bytes(f.read(8), byteorder="little")
327
+ self._size_check(address_len, metadata_len, buf_len, parsed_result.size)
328
+ total += buf_len
329
+ owner_address = f.read(address_len)
330
+ metadata = f.read(metadata_len)
331
+ # read remaining data to our buffer
332
+ self._put_object_to_store(
333
+ metadata, buf_len, f, object_ref, owner_address
334
+ )
335
+ return total
336
+
337
+ def delete_spilled_objects(self, urls: List[str]):
338
+ for url in urls:
339
+ path = parse_url_with_offset(url.decode()).base_url
340
+ try:
341
+ os.remove(path)
342
+ except FileNotFoundError:
343
+ # Occurs when the urls are retried during worker crash/failure.
344
+ pass
345
+
346
+ def destroy_external_storage(self):
347
+ for directory_path in self._directory_paths:
348
+ self._destroy_external_storage(directory_path)
349
+
350
+ def _destroy_external_storage(self, directory_path):
351
+ # There's a race condition where IO workers are still
+ # deleting individual objects while we try to delete the
+ # whole directory, so we should keep retrying until
+ # the directory is actually deleted.
+ while os.path.isdir(directory_path):
+ try:
+ shutil.rmtree(directory_path)
+ except FileNotFoundError:
+ # Raised when other IO workers are deleting
+ # files in this directory at the same time.
361
+ pass
362
+ except Exception:
363
+ logger.exception(
364
+ "Error cleaning up spill files. "
365
+ "You might still have remaining spilled "
366
+ "objects inside `ray_spilled_objects` directory."
367
+ )
368
+ break
369
+
370
+
371
+ class ExternalStorageRayStorageImpl(ExternalStorage):
372
+ """Implements the external storage interface using the ray storage API."""
373
+
374
+ def __init__(
375
+ self,
376
+ node_id: str,
377
+ session_name: str,
378
+ # For remote spilling, at least 1MB is recommended.
379
+ buffer_size=1024 * 1024,
380
+ # Override the storage config for unit tests.
381
+ _force_storage_for_testing: Optional[str] = None,
382
+ ):
383
+ from ray._private import storage
384
+
385
+ if _force_storage_for_testing:
386
+ storage._reset()
387
+ storage._init_storage(_force_storage_for_testing, True)
388
+
389
+ self._fs, storage_prefix = storage._get_filesystem_internal()
390
+ self._buffer_size = buffer_size
391
+ self._prefix = os.path.join(
392
+ storage_prefix, f"{DEFAULT_OBJECT_PREFIX}_{node_id}", session_name
393
+ )
394
+ self._fs.create_dir(self._prefix)
395
+
396
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
397
+ if len(object_refs) == 0:
398
+ return []
399
+ filename = _get_unique_spill_filename(object_refs)
400
+ url = f"{os.path.join(self._prefix, filename)}"
401
+ with self._fs.open_output_stream(url, buffer_size=self._buffer_size) as f:
402
+ return self._write_multiple_objects(f, object_refs, owner_addresses, url)
403
+
404
+ def restore_spilled_objects(
405
+ self, object_refs: List[ObjectRef], url_with_offset_list: List[str]
406
+ ):
407
+ total = 0
408
+ for i in range(len(object_refs)):
409
+ object_ref = object_refs[i]
410
+ url_with_offset = url_with_offset_list[i].decode()
411
+ # Retrieve the information needed.
412
+ parsed_result = parse_url_with_offset(url_with_offset)
413
+ base_url = parsed_result.base_url
414
+ offset = parsed_result.offset
415
+ # Read a part of the file and recover the object.
416
+ with self._fs.open_input_file(base_url) as f:
417
+ f.seek(offset)
418
+ address_len = int.from_bytes(f.read(8), byteorder="little")
419
+ metadata_len = int.from_bytes(f.read(8), byteorder="little")
420
+ buf_len = int.from_bytes(f.read(8), byteorder="little")
421
+ self._size_check(address_len, metadata_len, buf_len, parsed_result.size)
422
+ total += buf_len
423
+ owner_address = f.read(address_len)
424
+ metadata = f.read(metadata_len)
425
+ # read remaining data to our buffer
426
+ self._put_object_to_store(
427
+ metadata, buf_len, f, object_ref, owner_address
428
+ )
429
+ return total
430
+
431
+ def delete_spilled_objects(self, urls: List[str]):
432
+ for url in urls:
433
+ path = parse_url_with_offset(url.decode()).base_url
434
+ try:
435
+ self._fs.delete_file(path)
436
+ except FileNotFoundError:
437
+ # Occurs when the urls are retried during worker crash/failure.
438
+ pass
439
+
440
+ def destroy_external_storage(self):
441
+ try:
442
+ self._fs.delete_dir(self._prefix)
443
+ except Exception:
444
+ logger.exception(
445
+ "Error cleaning up spill files. "
446
+ "You might still have remaining spilled "
447
+ "objects inside `{}`.".format(self._prefix)
448
+ )
449
+
450
+
451
+ class ExternalStorageSmartOpenImpl(ExternalStorage):
452
+ """The external storage class implemented by smart_open.
453
+ (https://github.com/RaRe-Technologies/smart_open)
454
+
455
+ Smart open supports multiple backends with the same APIs.
456
+
457
+ To use this implementation, you should pre-create the given uri.
458
+ For example, if your uri is a local file path, you should pre-create
459
+ the directory.
460
+
461
+ Args:
462
+ uri: Storage URI used for smart open.
463
+ prefix: Prefix of objects that are stored.
464
+ override_transport_params: Overriding the default value of
465
+ transport_params for smart-open library.
466
+
467
+ Raises:
468
+ ModuleNotFoundError: If setup fails, for example
+ because the smart_open library
+ is not installed.
471
+ """
472
+
473
+ def __init__(
474
+ self,
475
+ node_id: str,
476
+ uri: Union[str, List[str]],
477
+ override_transport_params: dict = None,
478
+ buffer_size=1024 * 1024, # For remote spilling, at least 1MB is recommended.
479
+ ):
480
+ try:
481
+ from smart_open import open # noqa
482
+ except ModuleNotFoundError as e:
483
+ raise ModuleNotFoundError(
484
+ "Smart open is chosen to be a object spilling "
485
+ "external storage, but smart_open and boto3 "
486
+ f"is not downloaded. Original error: {e}"
487
+ )
488
+
489
+ # Validation
490
+ assert uri is not None, "uri should be provided to use object spilling."
491
+ if isinstance(uri, str):
492
+ uri = [uri]
493
+ assert isinstance(uri, list), "uri must be a single string or list of strings."
494
+ assert isinstance(buffer_size, int), "buffer_size must be an integer."
495
+
496
+ uri_is_s3 = [u.startswith("s3://") for u in uri]
497
+ self.is_for_s3 = all(uri_is_s3)
498
+ if not self.is_for_s3:
499
+ assert not any(uri_is_s3), "all uri's must be s3 or none can be s3."
500
+ self._uris = uri
501
+ else:
502
+ self._uris = [u.strip("/") for u in uri]
503
+ assert len(self._uris) == len(uri)
504
+
505
+ self._current_uri_index = random.randrange(0, len(self._uris))
506
+ self.prefix = f"{DEFAULT_OBJECT_PREFIX}_{node_id}"
507
+ self.override_transport_params = override_transport_params or {}
508
+
509
+ if self.is_for_s3:
510
+ import boto3 # noqa
511
+
512
+ # Setup boto3. It is essential because if we don't create boto
513
+ # session, smart_open will create a new session for every
514
+ # open call.
515
+ self.s3 = boto3.resource(service_name="s3")
516
+
517
+ # smart_open always seek to 0 if we don't set this argument.
518
+ # This will lead us to call a Object.get when it is not necessary,
519
+ # so defer seek and call seek before reading objects instead.
520
+ self.transport_params = {
521
+ "defer_seek": True,
522
+ "resource": self.s3,
523
+ "buffer_size": buffer_size,
524
+ }
525
+ else:
526
+ self.transport_params = {}
527
+
528
+ self.transport_params.update(self.override_transport_params)
529
+
530
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
531
+ if len(object_refs) == 0:
532
+ return []
533
+ from smart_open import open
534
+
535
+ # Choose the current uri by round robin order.
536
+ self._current_uri_index = (self._current_uri_index + 1) % len(self._uris)
537
+ uri = self._uris[self._current_uri_index]
538
+
539
+ key = f"{self.prefix}-{_get_unique_spill_filename(object_refs)}"
540
+ url = f"{uri}/{key}"
541
+
542
+ with open(
543
+ url,
544
+ mode="wb",
545
+ transport_params=self.transport_params,
546
+ ) as file_like:
547
+ return self._write_multiple_objects(
548
+ file_like, object_refs, owner_addresses, url
549
+ )
550
+
551
+ def restore_spilled_objects(
552
+ self, object_refs: List[ObjectRef], url_with_offset_list: List[str]
553
+ ):
554
+ from smart_open import open
555
+
556
+ total = 0
557
+ for i in range(len(object_refs)):
558
+ object_ref = object_refs[i]
559
+ url_with_offset = url_with_offset_list[i].decode()
560
+
561
+ # Retrieve the information needed.
562
+ parsed_result = parse_url_with_offset(url_with_offset)
563
+ base_url = parsed_result.base_url
564
+ offset = parsed_result.offset
565
+
566
+ with open(base_url, "rb", transport_params=self.transport_params) as f:
567
+ # smart open seek reads the file from offset-end_of_the_file
568
+ # when the seek is called.
569
+ f.seek(offset)
570
+ address_len = int.from_bytes(f.read(8), byteorder="little")
571
+ metadata_len = int.from_bytes(f.read(8), byteorder="little")
572
+ buf_len = int.from_bytes(f.read(8), byteorder="little")
573
+ self._size_check(address_len, metadata_len, buf_len, parsed_result.size)
574
+ owner_address = f.read(address_len)
575
+ total += buf_len
576
+ metadata = f.read(metadata_len)
577
+ # read remaining data to our buffer
578
+ self._put_object_to_store(
579
+ metadata, buf_len, f, object_ref, owner_address
580
+ )
581
+ return total
582
+
583
+ def delete_spilled_objects(self, urls: List[str]):
584
+ pass
585
+
586
+ def destroy_external_storage(self):
587
+ pass
588
+
589
+
590
+ _external_storage = NullStorage()
591
+
592
+
593
+ class UnstableFileStorage(FileSystemStorage):
594
+ """This class is for testing with writing failure."""
595
+
596
+ def __init__(self, node_id: str, **kwargs):
597
+ super().__init__(node_id, **kwargs)
598
+ self._failure_rate = 0.1
599
+ self._partial_failure_ratio = 0.2
600
+
601
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
602
+ r = random.random()
603
+ failed = r < self._failure_rate
604
+ partial_failed = r < self._partial_failure_ratio
605
+ if failed:
606
+ raise IOError("Spilling object failed")
607
+ elif partial_failed:
608
+ i = random.choice(range(len(object_refs)))
609
+ return super().spill_objects(object_refs[:i], owner_addresses)
610
+ else:
611
+ return super().spill_objects(object_refs, owner_addresses)
612
+
613
+
614
+ class SlowFileStorage(FileSystemStorage):
615
+ """This class is for testing slow object spilling."""
616
+
617
+ def __init__(self, node_id: str, **kwargs):
618
+ super().__init__(node_id, **kwargs)
619
+ self._min_delay = 1
620
+ self._max_delay = 2
621
+
622
+ def spill_objects(self, object_refs, owner_addresses) -> List[str]:
623
+ delay = random.random() * (self._max_delay - self._min_delay) + self._min_delay
624
+ time.sleep(delay)
625
+ return super().spill_objects(object_refs, owner_addresses)
626
+
627
+
628
+ def setup_external_storage(config, node_id, session_name):
629
+ """Setup the external storage according to the config."""
630
+ assert node_id is not None, "node_id should be provided."
631
+ global _external_storage
632
+ if config:
633
+ storage_type = config["type"]
634
+ if storage_type == "filesystem":
635
+ _external_storage = FileSystemStorage(node_id, **config["params"])
636
+ elif storage_type == "ray_storage":
637
+ _external_storage = ExternalStorageRayStorageImpl(
638
+ node_id, session_name, **config["params"]
639
+ )
640
+ elif storage_type == "smart_open":
641
+ _external_storage = ExternalStorageSmartOpenImpl(
642
+ node_id, **config["params"]
643
+ )
644
+ elif storage_type == "mock_distributed_fs":
645
+ # This storage is used to unit test distributed external storages.
646
+ # TODO(sang): Delete it after introducing the mock S3 test.
647
+ _external_storage = FileSystemStorage(node_id, **config["params"])
648
+ elif storage_type == "unstable_fs":
649
+ # This storage is used to unit test unstable file system for fault
650
+ # tolerance.
651
+ _external_storage = UnstableFileStorage(node_id, **config["params"])
652
+ elif storage_type == "slow_fs":
653
+ # This storage is used to unit test slow filesystems.
654
+ _external_storage = SlowFileStorage(node_id, **config["params"])
655
+ else:
656
+ raise ValueError(f"Unknown external storage type: {storage_type}")
657
+ else:
658
+ _external_storage = NullStorage()
659
+ return _external_storage
660
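A sketch of the config shapes this dispatcher accepts; the paths, node id, and session name below are hypothetical, and the `params` keys mirror the constructors above:

```python
# Spill to one or more local directories (FileSystemStorage).
fs_config = {
    "type": "filesystem",
    "params": {
        "directory_path": ["/tmp/spill_a", "/tmp/spill_b"],
        "buffer_size": 1024 * 1024,
    },
}
storage = setup_external_storage(fs_config, node_id="node_1", session_name="session_x")

# Spill to S3 via smart_open (ExternalStorageSmartOpenImpl).
s3_config = {"type": "smart_open", "params": {"uri": "s3://my-bucket/ray-spill"}}
```

Passing an empty or falsy config resets the module to NullStorage, whose methods raise until storage is configured.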
+
661
+
662
+ def reset_external_storage():
663
+ global _external_storage
664
+ _external_storage = NullStorage()
665
+
666
+
667
+ def spill_objects(object_refs, owner_addresses):
668
+ """Spill objects to the external storage. Objects are specified
669
+ by their object refs.
670
+
671
+ Args:
672
+ object_refs: The list of the refs of the objects to be spilled.
673
+ owner_addresses: The owner addresses of the provided object refs.
674
+ Returns:
675
+ A list of keys corresponding to the input object refs.
676
+ """
677
+ return _external_storage.spill_objects(object_refs, owner_addresses)
678
+
679
+
680
+ def restore_spilled_objects(
681
+ object_refs: List[ObjectRef], url_with_offset_list: List[str]
682
+ ):
683
+ """Restore objects from the external storage.
684
+
685
+ Args:
686
+ object_refs: List of object IDs (note that it is not ref).
687
+ url_with_offset_list: List of url_with_offset.
688
+ """
689
+ return _external_storage.restore_spilled_objects(object_refs, url_with_offset_list)
690
+
691
+
692
+ def delete_spilled_objects(urls: List[str]):
693
+ """Delete objects that are spilled to the external storage.
694
+
695
+ Args:
696
+ urls: URLs that store spilled object files.
697
+ """
698
+ _external_storage.delete_spilled_objects(urls)
699
+
700
+
701
+ def _get_unique_spill_filename(object_refs: List[ObjectRef]):
702
+ """Generate a unqiue spill file name.
703
+
704
+ Args:
705
+ object_refs: objects to be spilled in this file.
706
+ """
707
+ return f"{uuid.uuid4().hex}-multi-{len(object_refs)}"
.venv/lib/python3.11/site-packages/ray/_private/function_manager.py ADDED
@@ -0,0 +1,706 @@
1
+ import dis
2
+ import sys
3
+ import hashlib
4
+ import importlib
5
+ import inspect
6
+ import json
7
+ import logging
8
+ import os
9
+ import threading
10
+ import time
11
+ import traceback
12
+ from collections import defaultdict, namedtuple
13
+ from typing import Optional, Callable
14
+
15
+ import ray
16
+ from ray.remote_function import RemoteFunction
17
+ import ray._private.profiling as profiling
18
+ from ray import cloudpickle as pickle
19
+ from ray._private import ray_constants
20
+ from ray._private.inspect_util import (
21
+ is_class_method,
22
+ is_function_or_method,
23
+ is_static_method,
24
+ )
25
+ from ray._private.ray_constants import KV_NAMESPACE_FUNCTION_TABLE
26
+ from ray._private.utils import (
27
+ check_oversized_function,
28
+ ensure_str,
29
+ format_error_message,
30
+ )
31
+ from ray._private.serialization import pickle_dumps
32
+ from ray._raylet import (
33
+ JobID,
34
+ PythonFunctionDescriptor,
35
+ WORKER_PROCESS_SETUP_HOOK_KEY_NAME_GCS,
36
+ )
37
+
38
+ FunctionExecutionInfo = namedtuple(
39
+ "FunctionExecutionInfo", ["function", "function_name", "max_calls"]
40
+ )
41
+ ImportedFunctionInfo = namedtuple(
42
+ "ImportedFunctionInfo",
43
+ ["job_id", "function_id", "function_name", "function", "module", "max_calls"],
44
+ )
45
+
46
+ """FunctionExecutionInfo: A named tuple storing remote function information."""
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ def make_function_table_key(key_type: bytes, job_id: JobID, key: Optional[bytes]):
52
+ if key is None:
53
+ return b":".join([key_type, job_id.hex().encode()])
54
+ else:
55
+ return b":".join([key_type, job_id.hex().encode(), key])
56
+
57
+
58
+ class FunctionActorManager:
59
+ """A class used to export/load remote functions and actors.
60
+ Attributes:
61
+ _worker: The associated worker that this manager related.
62
+ _functions_to_export: The remote functions to export when
63
+ the worker gets connected.
64
+ _actors_to_export: The actors to export when the worker gets
65
+ connected.
66
+ _function_execution_info: The function_id
67
+ and execution_info.
68
+ _num_task_executions: The function
69
+ execution times.
70
+ imported_actor_classes: The set of actor classes keys (format:
71
+ ActorClass:function_id) that are already in GCS.
72
+ """
73
+
74
+ def __init__(self, worker):
75
+ self._worker = worker
76
+ self._functions_to_export = []
77
+ self._actors_to_export = []
78
+ # This field is a dictionary that maps function IDs
79
+ # to a FunctionExecutionInfo object. This should only be used on
80
+ # workers that execute remote functions.
81
+ self._function_execution_info = defaultdict(lambda: {})
82
+ self._num_task_executions = defaultdict(lambda: {})
83
+ # A set of all of the actor class keys that have been imported by the
84
+ # import thread. It is safe to convert this worker into an actor of
85
+ # these types.
86
+ self.imported_actor_classes = set()
87
+ self._loaded_actor_classes = {}
88
+ # Deserialize an ActorHandle will call load_actor_class(). If a
89
+ # function closure captured an ActorHandle, the deserialization of the
90
+ # function will be:
91
+ # -> fetch_and_register_remote_function (acquire lock)
92
+ # -> _load_actor_class_from_gcs (acquire lock, too)
93
+ # So, the lock should be a reentrant lock.
94
+ self.lock = threading.RLock()
95
+
96
+ self.execution_infos = {}
97
+ # This is the counter to keep track of how many keys have already
98
+ # been exported so that we can find next key quicker.
99
+ self._num_exported = 0
100
+ # This is to protect self._num_exported when doing exporting
101
+ self._export_lock = threading.Lock()
102
+
103
+ def increase_task_counter(self, function_descriptor):
104
+ function_id = function_descriptor.function_id
105
+ self._num_task_executions[function_id] += 1
106
+
107
+ def get_task_counter(self, function_descriptor):
108
+ function_id = function_descriptor.function_id
109
+ return self._num_task_executions[function_id]
110
+
111
+ def compute_collision_identifier(self, function_or_class):
112
+ """The identifier is used to detect excessive duplicate exports.
113
+ The identifier is used to determine when the same function or class is
114
+ exported many times. This can yield false positives.
115
+ Args:
116
+ function_or_class: The function or class to compute an identifier
117
+ for.
118
+ Returns:
119
+ The identifier. Note that different functions or classes can give
120
+ rise to same identifier. However, the same function should
121
+ hopefully always give rise to the same identifier. TODO(rkn):
122
+ verify if this is actually the case. Note that if the
123
+ identifier is incorrect in any way, then we may give warnings
124
+ unnecessarily or fail to give warnings, but the application's
125
+ behavior won't change.
126
+ """
127
+ import io
128
+
129
+ string_file = io.StringIO()
130
+ dis.dis(function_or_class, file=string_file, depth=2)
131
+ collision_identifier = function_or_class.__name__ + ":" + string_file.getvalue()
132
+
133
+ # Return a hash of the identifier in case it is too large.
134
+ return hashlib.sha1(collision_identifier.encode("utf-8")).digest()
135
+
136
+ def load_function_or_class_from_local(self, module_name, function_or_class_name):
137
+ """Try to load a function or class in the module from local."""
138
+ module = importlib.import_module(module_name)
139
+ parts = [part for part in function_or_class_name.split(".") if part]
140
+ object = module
141
+ try:
142
+ for part in parts:
143
+ object = getattr(object, part)
144
+ return object
145
+ except Exception:
146
+ return None
147
+
148
+ def export_setup_func(
149
+ self, setup_func: Callable, timeout: Optional[int] = None
150
+ ) -> bytes:
151
+ """Export the setup hook function and return the key."""
152
+ pickled_function = pickle_dumps(
153
+ setup_func,
154
+ "Cannot serialize the worker_process_setup_hook " f"{setup_func.__name__}",
155
+ )
156
+
157
+ function_to_run_id = hashlib.shake_128(pickled_function).digest(
158
+ ray_constants.ID_SIZE
159
+ )
160
+ key = make_function_table_key(
161
+ # This value should match with gcs_function_manager.h.
162
+ # Otherwise, it won't be GC'ed.
163
+ WORKER_PROCESS_SETUP_HOOK_KEY_NAME_GCS.encode(),
164
+ # b"FunctionsToRun",
165
+ self._worker.current_job_id.binary(),
166
+ function_to_run_id,
167
+ )
168
+
169
+ check_oversized_function(
170
+ pickled_function, setup_func.__name__, "function", self._worker
171
+ )
172
+
173
+ try:
174
+ self._worker.gcs_client.internal_kv_put(
175
+ key,
176
+ pickle.dumps(
177
+ {
178
+ "job_id": self._worker.current_job_id.binary(),
179
+ "function_id": function_to_run_id,
180
+ "function": pickled_function,
181
+ }
182
+ ),
183
+ # overwrite
184
+ True,
185
+ ray_constants.KV_NAMESPACE_FUNCTION_TABLE,
186
+ timeout=timeout,
187
+ )
188
+ except Exception as e:
189
+ logger.exception(
190
+ "Failed to export the setup hook " f"{setup_func.__name__}."
191
+ )
192
+ raise e
193
+
194
+ return key
195
+
196
+ def export(self, remote_function):
197
+ """Pickle a remote function and export it to redis.
198
+ Args:
199
+ remote_function: the RemoteFunction object.
200
+ """
201
+ if self._worker.load_code_from_local:
202
+ function_descriptor = remote_function._function_descriptor
203
+ module_name, function_name = (
204
+ function_descriptor.module_name,
205
+ function_descriptor.function_name,
206
+ )
207
+ # If the function is dynamic, we still export it to GCS
208
+ # even if load_code_from_local is set True.
209
+ if (
210
+ self.load_function_or_class_from_local(module_name, function_name)
211
+ is not None
212
+ ):
213
+ return
214
+ function = remote_function._function
215
+ pickled_function = remote_function._pickled_function
216
+
217
+ check_oversized_function(
218
+ pickled_function,
219
+ remote_function._function_name,
220
+ "remote function",
221
+ self._worker,
222
+ )
223
+ key = make_function_table_key(
224
+ b"RemoteFunction",
225
+ self._worker.current_job_id,
226
+ remote_function._function_descriptor.function_id.binary(),
227
+ )
228
+ if self._worker.gcs_client.internal_kv_exists(key, KV_NAMESPACE_FUNCTION_TABLE):
229
+ return
230
+ val = pickle.dumps(
231
+ {
232
+ "job_id": self._worker.current_job_id.binary(),
233
+ "function_id": remote_function._function_descriptor.function_id.binary(), # noqa: E501
234
+ "function_name": remote_function._function_name,
235
+ "module": function.__module__,
236
+ "function": pickled_function,
237
+ "collision_identifier": self.compute_collision_identifier(function),
238
+ "max_calls": remote_function._max_calls,
239
+ }
240
+ )
241
+ self._worker.gcs_client.internal_kv_put(
242
+ key, val, True, KV_NAMESPACE_FUNCTION_TABLE
243
+ )
244
+
245
+ def fetch_registered_method(
246
+ self, key: str, timeout: Optional[int] = None
247
+ ) -> Optional[ImportedFunctionInfo]:
248
+ vals = self._worker.gcs_client.internal_kv_get(
249
+ key, KV_NAMESPACE_FUNCTION_TABLE, timeout=timeout
250
+ )
251
+ if vals is None:
252
+ return None
253
+ else:
254
+ vals = pickle.loads(vals)
255
+ fields = [
256
+ "job_id",
257
+ "function_id",
258
+ "function_name",
259
+ "function",
260
+ "module",
261
+ "max_calls",
262
+ ]
263
+ return ImportedFunctionInfo._make(vals.get(field) for field in fields)
264
+
265
+ def fetch_and_register_remote_function(self, key):
266
+ """Import a remote function."""
267
+ remote_function_info = self.fetch_registered_method(key)
268
+ if not remote_function_info:
269
+ return False
270
+ (
271
+ job_id_str,
272
+ function_id_str,
273
+ function_name,
274
+ serialized_function,
275
+ module,
276
+ max_calls,
277
+ ) = remote_function_info
278
+
279
+ function_id = ray.FunctionID(function_id_str)
280
+ job_id = ray.JobID(job_id_str)
281
+ max_calls = int(max_calls)
282
+
283
+ # This function is called by ImportThread. This operation needs to be
284
+ # atomic. Otherwise, there is race condition. Another thread may use
285
+ # the temporary function above before the real function is ready.
286
+ with self.lock:
287
+ self._num_task_executions[function_id] = 0
288
+
289
+ try:
290
+ function = pickle.loads(serialized_function)
291
+ except Exception:
292
+ # If an exception was thrown when the remote function was
293
+ # imported, we record the traceback and notify the scheduler
294
+ # of the failure.
295
+ traceback_str = format_error_message(traceback.format_exc())
296
+
297
+ def f(*args, **kwargs):
298
+ raise RuntimeError(
299
+ "The remote function failed to import on the "
300
+ "worker. This may be because needed library "
301
+ "dependencies are not installed in the worker "
302
+ "environment or cannot be found from sys.path "
303
+ f"{sys.path}:\n\n{traceback_str}"
304
+ )
305
+
306
+ # Use a placeholder method when function pickled failed
307
+ self._function_execution_info[function_id] = FunctionExecutionInfo(
308
+ function=f, function_name=function_name, max_calls=max_calls
309
+ )
310
+
311
+ # Log the error message. Log at DEBUG level to avoid overly
312
+ # spamming the log on import failure. The user gets the error
313
+ # via the RuntimeError message above.
314
+ logger.debug(
315
+ "Failed to unpickle the remote function "
316
+ f"'{function_name}' with "
317
+ f"function ID {function_id.hex()}. "
318
+ f"Job ID:{job_id}."
319
+ f"Traceback:\n{traceback_str}. "
320
+ )
321
+ else:
322
+ # The below line is necessary. Because in the driver process,
323
+ # if the function is defined in the file where the python
324
+ # script was started from, its module is `__main__`.
325
+ # However in the worker process, the `__main__` module is a
326
+ # different module, which is `default_worker.py`
327
+ function.__module__ = module
328
+ self._function_execution_info[function_id] = FunctionExecutionInfo(
329
+ function=function, function_name=function_name, max_calls=max_calls
330
+ )
331
+ return True
332
+
333
+ def get_execution_info(self, job_id, function_descriptor):
334
+ """Get the FunctionExecutionInfo of a remote function.
335
+ Args:
336
+ job_id: ID of the job that the function belongs to.
337
+ function_descriptor: The FunctionDescriptor of the function to get.
338
+ Returns:
339
+ A FunctionExecutionInfo object.
340
+ """
341
+ function_id = function_descriptor.function_id
342
+ # If the function has already been loaded,
343
+ # There's no need to load again
344
+ if function_id in self._function_execution_info:
345
+ return self._function_execution_info[function_id]
346
+ if self._worker.load_code_from_local:
347
+ # Load function from local code.
348
+ if not function_descriptor.is_actor_method():
349
+ # If the function is not able to be loaded,
350
+ # try to load it from GCS,
351
+ # even if load_code_from_local is set True
352
+ if self._load_function_from_local(function_descriptor) is True:
353
+ return self._function_execution_info[function_id]
354
+ # Load function from GCS.
355
+ # Wait until the function to be executed has actually been
356
+ # registered on this worker. We will push warnings to the user if
357
+ # we spend too long in this loop.
358
+ # The driver function may not be found in sys.path. Try to load
359
+ # the function from GCS.
360
+ with profiling.profile("wait_for_function"):
361
+ self._wait_for_function(function_descriptor, job_id)
362
+ try:
363
+ function_id = function_descriptor.function_id
364
+ info = self._function_execution_info[function_id]
365
+ except KeyError as e:
366
+ message = (
367
+ "Error occurs in get_execution_info: "
368
+ "job_id: %s, function_descriptor: %s. Message: %s"
369
+ % (job_id, function_descriptor, e)
370
+ )
371
+ raise KeyError(message)
372
+ return info
373
+
374
+ def _load_function_from_local(self, function_descriptor):
375
+ assert not function_descriptor.is_actor_method()
376
+ function_id = function_descriptor.function_id
377
+
378
+ module_name, function_name = (
379
+ function_descriptor.module_name,
380
+ function_descriptor.function_name,
381
+ )
382
+
383
+ object = self.load_function_or_class_from_local(module_name, function_name)
384
+ if object is not None:
385
+ # Directly importing from local may break function with dynamic ray.remote,
386
+ # such as the _start_controller function utilized for the Ray service.
387
+ if isinstance(object, RemoteFunction):
388
+ function = object._function
389
+ else:
390
+ function = object
391
+ self._function_execution_info[function_id] = FunctionExecutionInfo(
392
+ function=function,
393
+ function_name=function_name,
394
+ max_calls=0,
395
+ )
396
+ self._num_task_executions[function_id] = 0
397
+ return True
398
+ else:
399
+ return False
400
+
401
+ def _wait_for_function(self, function_descriptor, job_id: str, timeout=10):
402
+ """Wait until the function to be executed is present on this worker.
403
+ This method will simply loop until the import thread has imported the
404
+ relevant function. If we spend too long in this loop, that may indicate
405
+ a problem somewhere and we will push an error message to the user.
406
+ If this worker is an actor, then this will wait until the actor has
407
+ been defined.
408
+ Args:
409
+ function_descriptor : The FunctionDescriptor of the function that
410
+ we want to execute.
411
+ job_id: The ID of the job to push the error message to
412
+ if this times out.
413
+ """
414
+ start_time = time.time()
415
+ # Only send the warning once.
416
+ warning_sent = False
417
+ while True:
418
+ with self.lock:
419
+ if self._worker.actor_id.is_nil():
420
+ if function_descriptor.function_id in self._function_execution_info:
421
+ break
422
+ else:
423
+ key = make_function_table_key(
424
+ b"RemoteFunction",
425
+ job_id,
426
+ function_descriptor.function_id.binary(),
427
+ )
428
+ if self.fetch_and_register_remote_function(key) is True:
429
+ break
430
+ else:
431
+ assert not self._worker.actor_id.is_nil()
432
+ # Actor loading will happen when execute_task is called.
433
+ assert self._worker.actor_id in self._worker.actors
434
+ break
435
+
436
+ if time.time() - start_time > timeout:
437
+ warning_message = (
438
+ "This worker was asked to execute a function "
439
+ f"that has not been registered ({function_descriptor}, "
440
+ f"node={self._worker.node_ip_address}, "
441
+ f"worker_id={self._worker.worker_id.hex()}, "
442
+ f"pid={os.getpid()}). You may have to restart Ray."
443
+ )
444
+ if not warning_sent:
445
+ logger.error(warning_message)
446
+ ray._private.utils.push_error_to_driver(
447
+ self._worker,
448
+ ray_constants.WAIT_FOR_FUNCTION_PUSH_ERROR,
449
+ warning_message,
450
+ job_id=job_id,
451
+ )
452
+ warning_sent = True
453
+ time.sleep(0.001)
454
+
455
+ def export_actor_class(
456
+ self, Class, actor_creation_function_descriptor, actor_method_names
457
+ ):
458
+ if self._worker.load_code_from_local:
459
+ module_name, class_name = (
460
+ actor_creation_function_descriptor.module_name,
461
+ actor_creation_function_descriptor.class_name,
462
+ )
463
+ # If the class is dynamic, we still export it to GCS
464
+ # even if load_code_from_local is set True.
465
+ if (
466
+ self.load_function_or_class_from_local(module_name, class_name)
467
+ is not None
468
+ ):
469
+ return
470
+
471
+ # `current_job_id` shouldn't be NIL, unless:
472
+ # 1) This worker isn't an actor;
473
+ # 2) And a previous task started a background thread, which didn't
474
+ # finish before the task finished, and still uses Ray API
475
+ # after that.
476
+ assert not self._worker.current_job_id.is_nil(), (
477
+ "You might have started a background thread in a non-actor "
478
+ "task, please make sure the thread finishes before the "
479
+ "task finishes."
480
+ )
481
+ job_id = self._worker.current_job_id
482
+ key = make_function_table_key(
483
+ b"ActorClass",
484
+ job_id,
485
+ actor_creation_function_descriptor.function_id.binary(),
486
+ )
487
+ serialized_actor_class = pickle_dumps(
488
+ Class,
489
+ f"Could not serialize the actor class "
490
+ f"{actor_creation_function_descriptor.repr}",
491
+ )
492
+ actor_class_info = {
493
+ "class_name": actor_creation_function_descriptor.class_name.split(".")[-1],
494
+ "module": actor_creation_function_descriptor.module_name,
495
+ "class": serialized_actor_class,
496
+ "job_id": job_id.binary(),
497
+ "collision_identifier": self.compute_collision_identifier(Class),
498
+ "actor_method_names": json.dumps(list(actor_method_names)),
499
+ }
500
+
501
+ check_oversized_function(
502
+ actor_class_info["class"],
503
+ actor_class_info["class_name"],
504
+ "actor",
505
+ self._worker,
506
+ )
507
+
508
+ self._worker.gcs_client.internal_kv_put(
509
+ key, pickle.dumps(actor_class_info), True, KV_NAMESPACE_FUNCTION_TABLE
510
+ )
511
+ # TODO(rkn): Currently we allow actor classes to be defined
512
+ # within tasks. I tried to disable this, but it may be necessary
513
+ # because of https://github.com/ray-project/ray/issues/1146.
514
+
515
+ def load_actor_class(self, job_id, actor_creation_function_descriptor):
516
+ """Load the actor class.
517
+ Args:
518
+ job_id: job ID of the actor.
519
+ actor_creation_function_descriptor: Function descriptor of
520
+ the actor constructor.
521
+ Returns:
522
+ The actor class.
523
+ """
524
+ function_id = actor_creation_function_descriptor.function_id
525
+ # Check if the actor class already exists in the cache.
526
+ actor_class = self._loaded_actor_classes.get(function_id, None)
527
+ if actor_class is None:
528
+ # Load actor class.
529
+ if self._worker.load_code_from_local:
530
+ # Load actor class from local code first.
531
+ actor_class = self._load_actor_class_from_local(
532
+ actor_creation_function_descriptor
533
+ )
534
+ # If the actor is unable to be loaded
535
+ # from local, try to load it
536
+ # from GCS even if load_code_from_local is set True
537
+ if actor_class is None:
538
+ actor_class = self._load_actor_class_from_gcs(
539
+ job_id, actor_creation_function_descriptor
540
+ )
541
+
542
+ else:
543
+ # Load actor class from GCS.
544
+ actor_class = self._load_actor_class_from_gcs(
545
+ job_id, actor_creation_function_descriptor
546
+ )
547
+ # Save the loaded actor class in cache.
548
+ self._loaded_actor_classes[function_id] = actor_class
549
+
550
+ # Generate execution info for the methods of this actor class.
551
+ module_name = actor_creation_function_descriptor.module_name
552
+ actor_class_name = actor_creation_function_descriptor.class_name
553
+ actor_methods = inspect.getmembers(
554
+ actor_class, predicate=is_function_or_method
555
+ )
556
+ for actor_method_name, actor_method in actor_methods:
557
+ # Actor creation function descriptor use a unique function
558
+ # hash to solve actor name conflict. When constructing an
559
+ # actor, the actor creation function descriptor will be the
560
+ # key to find __init__ method execution info. So, here we
561
+ # use actor creation function descriptor as method descriptor
562
+ # for generating __init__ method execution info.
563
+ if actor_method_name == "__init__":
564
+ method_descriptor = actor_creation_function_descriptor
565
+ else:
566
+ method_descriptor = PythonFunctionDescriptor(
567
+ module_name, actor_method_name, actor_class_name
568
+ )
569
+ method_id = method_descriptor.function_id
570
+ executor = self._make_actor_method_executor(
571
+ actor_method_name,
572
+ actor_method,
573
+ actor_imported=True,
574
+ )
575
+ self._function_execution_info[method_id] = FunctionExecutionInfo(
576
+ function=executor,
577
+ function_name=actor_method_name,
578
+ max_calls=0,
579
+ )
580
+ self._num_task_executions[method_id] = 0
581
+ self._num_task_executions[function_id] = 0
582
+ return actor_class
583
+
584
+ def _load_actor_class_from_local(self, actor_creation_function_descriptor):
585
+ """Load actor class from local code."""
586
+ module_name, class_name = (
587
+ actor_creation_function_descriptor.module_name,
588
+ actor_creation_function_descriptor.class_name,
589
+ )
590
+
591
+ object = self.load_function_or_class_from_local(module_name, class_name)
592
+
593
+ if object is not None:
594
+ if isinstance(object, ray.actor.ActorClass):
595
+ return object.__ray_metadata__.modified_class
596
+ else:
597
+ return object
598
+ else:
599
+ return None
600
+
601
+ def _create_fake_actor_class(
602
+ self, actor_class_name, actor_method_names, traceback_str
603
+ ):
604
+ class TemporaryActor:
605
+ pass
606
+
607
+ def temporary_actor_method(*args, **kwargs):
608
+ raise RuntimeError(
609
+ f"The actor with name {actor_class_name} "
610
+ "failed to import on the worker. This may be because "
611
+ "needed library dependencies are not installed in the "
612
+ f"worker environment:\n\n{traceback_str}"
613
+ )
614
+
615
+ for method in actor_method_names:
616
+ setattr(TemporaryActor, method, temporary_actor_method)
617
+
618
+ return TemporaryActor
619
+
620
+ def _load_actor_class_from_gcs(self, job_id, actor_creation_function_descriptor):
621
+ """Load actor class from GCS."""
622
+ key = make_function_table_key(
623
+ b"ActorClass",
624
+ job_id,
625
+ actor_creation_function_descriptor.function_id.binary(),
626
+ )
627
+
628
+ # Fetch raw data from GCS.
629
+ vals = self._worker.gcs_client.internal_kv_get(key, KV_NAMESPACE_FUNCTION_TABLE)
630
+ fields = ["job_id", "class_name", "module", "class", "actor_method_names"]
631
+ if vals is None:
632
+ vals = {}
633
+ else:
634
+ vals = pickle.loads(vals)
635
+ (job_id_str, class_name, module, pickled_class, actor_method_names) = (
636
+ vals.get(field) for field in fields
637
+ )
638
+
639
+ class_name = ensure_str(class_name)
640
+ module_name = ensure_str(module)
641
+ job_id = ray.JobID(job_id_str)
642
+ actor_method_names = json.loads(ensure_str(actor_method_names))
643
+
644
+ actor_class = None
645
+ try:
646
+ with self.lock:
647
+ actor_class = pickle.loads(pickled_class)
648
+ except Exception:
649
+ logger.debug("Failed to load actor class %s.", class_name)
650
+ # If an exception was thrown when the actor was imported, we record
651
+ # the traceback and notify the scheduler of the failure.
652
+ traceback_str = format_error_message(traceback.format_exc())
653
+ # The actor class failed to be unpickled, create a fake actor
654
+ # class instead (just to produce error messages and to prevent
655
+ # the driver from hanging).
656
+ actor_class = self._create_fake_actor_class(
657
+ class_name, actor_method_names, traceback_str
658
+ )
659
+
660
+ # The below line is necessary. Because in the driver process,
661
+ # if the function is defined in the file where the python script
662
+ # was started from, its module is `__main__`.
663
+ # However in the worker process, the `__main__` module is a
664
+ # different module, which is `default_worker.py`
665
+ actor_class.__module__ = module_name
666
+ return actor_class
667
+
668
+ def _make_actor_method_executor(
669
+ self, method_name: str, method, actor_imported: bool
670
+ ):
671
+ """Make an executor that wraps a user-defined actor method.
672
+ The wrapped method updates the worker's internal state and performs any
673
+ necessary checkpointing operations.
674
+ Args:
675
+ method_name: The name of the actor method.
676
+ method: The actor method to wrap. This should be a
677
+ method defined on the actor class and should therefore take an
678
+ instance of the actor as the first argument.
679
+ actor_imported: Whether the actor has been imported.
680
+ Checkpointing operations will not be run if this is set to
681
+ False.
682
+ Returns:
683
+ A function that executes the given actor method on the worker's
684
+ stored instance of the actor. The function also updates the
685
+ worker's internal state to record the executed method.
686
+ """
687
+
688
+ def actor_method_executor(__ray_actor, *args, **kwargs):
689
+ # Execute the assigned method.
690
+ is_bound = is_class_method(method) or is_static_method(
691
+ type(__ray_actor), method_name
692
+ )
693
+ if is_bound:
694
+ return method(*args, **kwargs)
695
+ else:
696
+ return method(__ray_actor, *args, **kwargs)
697
+
698
+ # Set method_name and method as attributes to the executor closure
699
+ # so we can make decisions based on these attributes in the task executor.
+ # Specifically, asyncio support requires knowing whether:
701
+ # - the method is a ray internal method: starts with __ray
702
+ # - the method is a coroutine function: defined by async def
703
+ actor_method_executor.name = method_name
704
+ actor_method_executor.method = method
705
+
706
+ return actor_method_executor
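
The dispatch in `actor_method_executor` above passes the actor instance explicitly only for plain instance methods; class and static methods are invoked as-is. A minimal standalone sketch of that rule, using a toy class that is not part of Ray (`Greeter` and `execute` are hypothetical names, and `is_bound` stands in for the `is_class_method`/`is_static_method` checks):

    class Greeter:  # toy actor class for illustration only
        def hello(self, name):
            return f"hello {name}"

        @staticmethod
        def version():
            return "1.0"

    actor_instance = Greeter()

    def execute(method, *args, is_bound=False):
        # Plain methods get the stored actor instance prepended;
        # class/static methods are already callable without it.
        return method(*args) if is_bound else method(actor_instance, *args)

    assert execute(Greeter.hello, "ray") == "hello ray"
    assert execute(Greeter.version, is_bound=True) == "1.0"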
.venv/lib/python3.11/site-packages/ray/_private/gcs_aio_client.py ADDED
@@ -0,0 +1,47 @@
1
+ import logging
2
+ from typing import Optional
3
+ import ray
4
+ from ray._raylet import InnerGcsClient
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class GcsAioClient:
10
+ """
11
+ Async GCS client.
12
+
13
+ Historical note: there was a `ray::gcs::PythonGcsClient` C++ binding that had
+ only a sync API, which we wrapped with a ThreadPoolExecutor in Python. It has
+ been removed in favor of `ray::gcs::GcsClient`, which provides an async API.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ address: str = None,
21
+ loop=None,
22
+ executor=None,
23
+ nums_reconnect_retry: int = 5,
24
+ cluster_id: Optional[str] = None,
25
+ ):
26
+ # This must be consistent with GcsClient.__cinit__ in _raylet.pyx
27
+ timeout_ms = ray._config.py_gcs_connect_timeout_s() * 1000
28
+ self.inner = InnerGcsClient.standalone(
29
+ str(address), cluster_id=cluster_id, timeout_ms=timeout_ms
30
+ )
31
+ # Forwarded Methods. Not using __getattr__ because we want one fewer layer of
32
+ # indirection.
33
+ self.internal_kv_get = self.inner.async_internal_kv_get
34
+ self.internal_kv_multi_get = self.inner.async_internal_kv_multi_get
35
+ self.internal_kv_put = self.inner.async_internal_kv_put
36
+ self.internal_kv_del = self.inner.async_internal_kv_del
37
+ self.internal_kv_exists = self.inner.async_internal_kv_exists
38
+ self.internal_kv_keys = self.inner.async_internal_kv_keys
39
+ self.check_alive = self.inner.async_check_alive
40
+ self.get_all_job_info = self.inner.async_get_all_job_info
41
+ # Forwarded Properties.
42
+ self.address = self.inner.address
43
+ self.cluster_id = self.inner.cluster_id
44
+ # Note: these only exist in the new client.
45
+ self.get_all_actor_info = self.inner.async_get_all_actor_info
46
+ self.get_all_node_info = self.inner.async_get_all_node_info
47
+ self.kill_actor = self.inner.async_kill_actor
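
A hedged usage sketch for the client above. The GCS address is a placeholder, the snippet must run inside an asyncio event loop, and the argument order of the kv methods is an assumption here (mirroring the synchronous client's `internal_kv_get(key, namespace)` usage seen elsewhere in this diff):

    import asyncio

    from ray._private.gcs_aio_client import GcsAioClient

    async def read_key():
        # Placeholder address; assumes a running GCS at this host:port.
        client = GcsAioClient(address="127.0.0.1:6379")
        # Assumed to mirror the sync client: (key, namespace).
        value = await client.internal_kv_get(b"my_key", None)
        print(value)

    asyncio.run(read_key())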
.venv/lib/python3.11/site-packages/ray/_private/gcs_pubsub.py ADDED
@@ -0,0 +1,311 @@
1
+ import asyncio
2
+ from collections import deque
3
+ import logging
4
+ import random
5
+ from typing import Tuple, List
6
+
7
+ import grpc
8
+ from ray._private.utils import get_or_create_event_loop
9
+
10
+ try:
11
+ from grpc import aio as aiogrpc
12
+ except ImportError:
13
+ from grpc.experimental import aio as aiogrpc
14
+
15
+ import ray._private.gcs_utils as gcs_utils
16
+ from ray.core.generated import gcs_service_pb2_grpc
17
+ from ray.core.generated import gcs_service_pb2
18
+ from ray.core.generated import gcs_pb2
19
+ from ray.core.generated import common_pb2
20
+ from ray.core.generated import pubsub_pb2
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Max retries for GCS publisher connection error
25
+ MAX_GCS_PUBLISH_RETRIES = 60
26
+
27
+
28
+ class _PublisherBase:
29
+ @staticmethod
30
+ def _create_node_resource_usage_request(key: str, json: str):
31
+ return gcs_service_pb2.GcsPublishRequest(
32
+ pub_messages=[
33
+ pubsub_pb2.PubMessage(
34
+ channel_type=pubsub_pb2.RAY_NODE_RESOURCE_USAGE_CHANNEL,
35
+ key_id=key.encode(),
36
+ node_resource_usage_message=common_pb2.NodeResourceUsage(json=json),
37
+ )
38
+ ]
39
+ )
40
+
41
+
42
+ class _SubscriberBase:
43
+ def __init__(self, worker_id: bytes = None):
44
+ self._worker_id = worker_id
45
+ # self._subscriber_id needs to match the binary format of a random
46
+ # SubscriberID / UniqueID, which is 28 (kUniqueIDSize) random bytes.
47
+ self._subscriber_id = bytes(bytearray(random.getrandbits(8) for _ in range(28)))
48
+ self._last_batch_size = 0
49
+ self._max_processed_sequence_id = 0
50
+ self._publisher_id = b""
51
+
52
+ # Batch size of the result from last poll. Used to indicate whether the
53
+ # subscriber can keep up.
54
+ @property
55
+ def last_batch_size(self):
56
+ return self._last_batch_size
57
+
58
+ def _subscribe_request(self, channel):
59
+ cmd = pubsub_pb2.Command(channel_type=channel, subscribe_message={})
60
+ req = gcs_service_pb2.GcsSubscriberCommandBatchRequest(
61
+ subscriber_id=self._subscriber_id, sender_id=self._worker_id, commands=[cmd]
62
+ )
63
+ return req
64
+
65
+ def _poll_request(self):
66
+ return gcs_service_pb2.GcsSubscriberPollRequest(
67
+ subscriber_id=self._subscriber_id,
68
+ max_processed_sequence_id=self._max_processed_sequence_id,
69
+ publisher_id=self._publisher_id,
70
+ )
71
+
72
+ def _unsubscribe_request(self, channels):
73
+ req = gcs_service_pb2.GcsSubscriberCommandBatchRequest(
74
+ subscriber_id=self._subscriber_id, sender_id=self._worker_id, commands=[]
75
+ )
76
+ for channel in channels:
77
+ req.commands.append(
78
+ pubsub_pb2.Command(channel_type=channel, unsubscribe_message={})
79
+ )
80
+ return req
81
+
82
+ @staticmethod
83
+ def _should_terminate_polling(e: grpc.RpcError) -> bool:
84
+ # Caller only expects polling to be terminated after deadline exceeded.
85
+ if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
86
+ return True
87
+ # Could be a temporary connection issue. Suppress error.
88
+ # TODO: reconnect GRPC channel?
89
+ if e.code() == grpc.StatusCode.UNAVAILABLE:
90
+ return True
91
+ return False
92
+
93
+
94
+ class GcsAioPublisher(_PublisherBase):
95
+ """Publisher to GCS. Uses async io."""
96
+
97
+ def __init__(self, address: str = None, channel: aiogrpc.Channel = None):
98
+ if address:
99
+ assert channel is None, "address and channel cannot both be specified"
100
+ channel = gcs_utils.create_gcs_channel(address, aio=True)
101
+ else:
102
+ assert channel is not None, "One of address and channel must be specified"
103
+ self._stub = gcs_service_pb2_grpc.InternalPubSubGcsServiceStub(channel)
104
+
105
+ async def publish_resource_usage(self, key: str, json: str) -> None:
106
+ """Publishes logs to GCS."""
107
+ req = self._create_node_resource_usage_request(key, json)
108
+ await self._stub.GcsPublish(req)
109
+
110
+
111
+ class _AioSubscriber(_SubscriberBase):
112
+ """Async io subscriber to GCS.
113
+
114
+ Usage example common to Aio subscribers:
115
+ subscriber = GcsAioXxxSubscriber(address="...")
116
+ await subscriber.subscribe()
117
+ while running:
118
+ ...... = await subscriber.poll()
119
+ ......
120
+ await subscriber.close()
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ pubsub_channel_type,
126
+ worker_id: bytes = None,
127
+ address: str = None,
128
+ channel: aiogrpc.Channel = None,
129
+ ):
130
+ super().__init__(worker_id)
131
+
132
+ if address:
133
+ assert channel is None, "address and channel cannot both be specified"
134
+ channel = gcs_utils.create_gcs_channel(address, aio=True)
135
+ else:
136
+ assert channel is not None, "One of address and channel must be specified"
137
+ # GRPC stub to GCS pubsub.
138
+ self._stub = gcs_service_pb2_grpc.InternalPubSubGcsServiceStub(channel)
139
+
140
+ # Type of the channel.
141
+ self._channel = pubsub_channel_type
142
+ # A queue of received PubMessage.
143
+ self._queue = deque()
144
+ # Indicates whether the subscriber has closed.
145
+ self._close = asyncio.Event()
146
+
147
+ async def subscribe(self) -> None:
148
+ """Registers a subscription for the subscriber's channel type.
149
+
150
+ Before the registration, published messages in the channel will not be
151
+ saved for the subscriber.
152
+ """
153
+ if self._close.is_set():
154
+ return
155
+ req = self._subscribe_request(self._channel)
156
+ await self._stub.GcsSubscriberCommandBatch(req, timeout=30)
157
+
158
+ async def _poll_call(self, req, timeout=None):
159
+ # Wrap GRPC _AioCall as a coroutine.
160
+ return await self._stub.GcsSubscriberPoll(req, timeout=timeout)
161
+
162
+ async def _poll(self, timeout=None) -> None:
163
+ while len(self._queue) == 0:
164
+ req = self._poll_request()
165
+ poll = get_or_create_event_loop().create_task(
166
+ self._poll_call(req, timeout=timeout)
167
+ )
168
+ close = get_or_create_event_loop().create_task(self._close.wait())
169
+ done, others = await asyncio.wait(
170
+ [poll, close], timeout=timeout, return_when=asyncio.FIRST_COMPLETED
171
+ )
172
+ # Cancel the other task if needed to prevent memory leak.
173
+ other_task = others.pop()
174
+ if not other_task.done():
175
+ other_task.cancel()
176
+ if poll not in done or close in done:
177
+ # Request timed out or subscriber closed.
178
+ break
179
+ try:
180
+ self._last_batch_size = len(poll.result().pub_messages)
181
+ if poll.result().publisher_id != self._publisher_id:
182
+ if self._publisher_id != "":
183
+ logger.debug(
184
+ f"replied publisher_id {poll.result().publisher_id}"
185
+ f"different from {self._publisher_id}, this should "
186
+ "only happens during gcs failover."
187
+ )
188
+ self._publisher_id = poll.result().publisher_id
189
+ self._max_processed_sequence_id = 0
190
+ for msg in poll.result().pub_messages:
191
+ if msg.sequence_id <= self._max_processed_sequence_id:
192
+ logger.warning(f"Ignoring out of order message {msg}")
193
+ continue
194
+ self._max_processed_sequence_id = msg.sequence_id
195
+ self._queue.append(msg)
196
+ except grpc.RpcError as e:
197
+ if self._should_terminate_polling(e):
198
+ return
199
+ raise
200
+
201
+ async def close(self) -> None:
202
+ """Closes the subscriber and its active subscription."""
203
+
204
+ # Mark close to terminate inflight polling and prevent future requests.
205
+ if self._close.is_set():
206
+ return
207
+ self._close.set()
208
+ req = self._unsubscribe_request(channels=[self._channel])
209
+ try:
210
+ await self._stub.GcsSubscriberCommandBatch(req, timeout=5)
211
+ except Exception:
212
+ pass
213
+ self._stub = None
214
+
215
+
216
+ class GcsAioResourceUsageSubscriber(_AioSubscriber):
217
+ def __init__(
218
+ self,
219
+ worker_id: bytes = None,
220
+ address: str = None,
221
+ channel: grpc.Channel = None,
222
+ ):
223
+ super().__init__(
224
+ pubsub_pb2.RAY_NODE_RESOURCE_USAGE_CHANNEL, worker_id, address, channel
225
+ )
226
+
227
+ async def poll(self, timeout=None) -> Tuple[bytes, str]:
228
+ """Polls for new resource usage message.
229
+
230
+ Returns:
231
+ A tuple of string reporter ID and resource usage json string.
232
+ """
233
+ await self._poll(timeout=timeout)
234
+ return self._pop_resource_usage(self._queue)
235
+
236
+ @staticmethod
237
+ def _pop_resource_usage(queue):
238
+ if len(queue) == 0:
239
+ return None, None
240
+ msg = queue.popleft()
241
+ return msg.key_id.decode(), msg.node_resource_usage_message.json
242
+
243
+
244
+ class GcsAioActorSubscriber(_AioSubscriber):
245
+ def __init__(
246
+ self,
247
+ worker_id: bytes = None,
248
+ address: str = None,
249
+ channel: grpc.Channel = None,
250
+ ):
251
+ super().__init__(pubsub_pb2.GCS_ACTOR_CHANNEL, worker_id, address, channel)
252
+
253
+ @property
254
+ def queue_size(self):
255
+ return len(self._queue)
256
+
257
+ async def poll(
258
+ self, batch_size, timeout=None
259
+ ) -> List[Tuple[bytes, gcs_pb2.ActorTableData]]:
260
+ """Polls for new actor message.
261
+
262
+ Returns:
263
+ A list of tuples of binary actor ID and actor table data.
264
+ """
265
+ await self._poll(timeout=timeout)
266
+ return self._pop_actors(self._queue, batch_size=batch_size)
267
+
268
+ @staticmethod
269
+ def _pop_actors(queue, batch_size):
270
+ if len(queue) == 0:
271
+ return []
272
+ popped = 0
273
+ msgs = []
274
+ while len(queue) > 0 and popped < batch_size:
275
+ msg = queue.popleft()
276
+ msgs.append((msg.key_id, msg.actor_message))
277
+ popped += 1
278
+ return msgs
279
+
280
+
281
+ class GcsAioNodeInfoSubscriber(_AioSubscriber):
282
+ def __init__(
283
+ self,
284
+ worker_id: bytes = None,
285
+ address: str = None,
286
+ channel: grpc.Channel = None,
287
+ ):
288
+ super().__init__(pubsub_pb2.GCS_NODE_INFO_CHANNEL, worker_id, address, channel)
289
+
290
+ async def poll(
291
+ self, batch_size, timeout=None
292
+ ) -> List[Tuple[bytes, gcs_pb2.GcsNodeInfo]]:
293
+ """Polls for new node info message.
294
+
295
+ Returns:
296
+ A list of tuples of (node_id, GcsNodeInfo).
297
+ """
298
+ await self._poll(timeout=timeout)
299
+ return self._pop_node_infos(self._queue, batch_size=batch_size)
300
+
301
+ @staticmethod
302
+ def _pop_node_infos(queue, batch_size):
303
+ if len(queue) == 0:
304
+ return []
305
+ popped = 0
306
+ msgs = []
307
+ while len(queue) > 0 and popped < batch_size:
308
+ msg = queue.popleft()
309
+ msgs.append((msg.key_id, msg.node_info_message))
310
+ popped += 1
311
+ return msgs
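
A sketch of the subscribe/poll/close pattern documented in `_AioSubscriber` above, applied to `GcsAioActorSubscriber`. The GCS address is a placeholder; each polled item is a `(actor_id_bytes, gcs_pb2.ActorTableData)` pair, as described in `poll`:

    import asyncio

    from ray._private.gcs_pubsub import GcsAioActorSubscriber

    async def watch_actor_updates():
        subscriber = GcsAioActorSubscriber(address="127.0.0.1:6379")  # placeholder
        await subscriber.subscribe()
        try:
            updates = await subscriber.poll(batch_size=100, timeout=5)
            for actor_id, actor_data in updates:
                print(actor_id.hex(), actor_data.state)
        finally:
            await subscriber.close()

    asyncio.run(watch_actor_updates())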
.venv/lib/python3.11/site-packages/ray/_private/gcs_utils.py ADDED
@@ -0,0 +1,163 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ from ray._private import ray_constants
5
+
6
+ import ray._private.gcs_aio_client
7
+
8
+ from ray.core.generated.common_pb2 import ErrorType, JobConfig
9
+ from ray.core.generated.gcs_pb2 import (
10
+ ActorTableData,
11
+ AvailableResources,
12
+ TotalResources,
13
+ ErrorTableData,
14
+ GcsEntry,
15
+ GcsNodeInfo,
16
+ JobTableData,
17
+ PlacementGroupTableData,
18
+ PubSubMessage,
19
+ ResourceDemand,
20
+ ResourceLoad,
21
+ ResourcesData,
22
+ ResourceUsageBatchData,
23
+ TablePrefix,
24
+ TablePubsub,
25
+ TaskEvents,
26
+ WorkerTableData,
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ __all__ = [
32
+ "ActorTableData",
33
+ "GcsNodeInfo",
34
+ "AvailableResources",
35
+ "TotalResources",
36
+ "JobTableData",
37
+ "JobConfig",
38
+ "ErrorTableData",
39
+ "ErrorType",
40
+ "GcsEntry",
41
+ "ResourceUsageBatchData",
42
+ "ResourcesData",
43
+ "TablePrefix",
44
+ "TablePubsub",
45
+ "TaskEvents",
46
+ "ResourceDemand",
47
+ "ResourceLoad",
48
+ "PubSubMessage",
49
+ "WorkerTableData",
50
+ "PlacementGroupTableData",
51
+ ]
52
+
53
+
54
+ WORKER = 0
55
+ DRIVER = 1
56
+
57
+ # Cap messages at 512MB
58
+ _MAX_MESSAGE_LENGTH = 512 * 1024 * 1024
59
+ # Send keepalive every 60s
60
+ _GRPC_KEEPALIVE_TIME_MS = 60 * 1000
61
+ # Keepalive should be replied < 60s
62
+ _GRPC_KEEPALIVE_TIMEOUT_MS = 60 * 1000
63
+
64
+ # Also relying on these defaults:
65
+ # grpc.keepalive_permit_without_calls=0: No keepalive without inflight calls.
66
+ # grpc.use_local_subchannel_pool=0: Subchannels are shared.
67
+ _GRPC_OPTIONS = [
68
+ *ray_constants.GLOBAL_GRPC_OPTIONS,
69
+ ("grpc.max_send_message_length", _MAX_MESSAGE_LENGTH),
70
+ ("grpc.max_receive_message_length", _MAX_MESSAGE_LENGTH),
71
+ ("grpc.keepalive_time_ms", _GRPC_KEEPALIVE_TIME_MS),
72
+ ("grpc.keepalive_timeout_ms", _GRPC_KEEPALIVE_TIMEOUT_MS),
73
+ ]
74
+
75
+
76
+ def create_gcs_channel(address: str, aio=False):
77
+ """Returns a GRPC channel to GCS.
78
+
79
+ Args:
80
+ address: GCS address string, e.g. ip:port
81
+ aio: Whether using grpc.aio
82
+ Returns:
83
+ grpc.Channel or grpc.aio.Channel to GCS
84
+ """
85
+ from ray._private.utils import init_grpc_channel
86
+
87
+ return init_grpc_channel(address, options=_GRPC_OPTIONS, asynchronous=aio)
88
+
89
+
90
+ class GcsChannel:
91
+ def __init__(self, gcs_address: Optional[str] = None, aio: bool = False):
92
+ self._gcs_address = gcs_address
93
+ self._aio = aio
94
+
95
+ @property
96
+ def address(self):
97
+ return self._gcs_address
98
+
99
+ def connect(self):
100
+ # GCS server uses a cached port, so it should use the same port after
101
+ # restarting. This means GCS address should stay the same for the
102
+ # lifetime of the Ray cluster.
103
+ self._channel = create_gcs_channel(self._gcs_address, self._aio)
104
+
105
+ def channel(self):
106
+ return self._channel
107
+
108
+
109
+ # re-export
110
+ GcsAioClient = ray._private.gcs_aio_client.GcsAioClient
111
+
112
+
113
+ def cleanup_redis_storage(
114
+ host: str,
115
+ port: int,
116
+ password: str,
117
+ use_ssl: bool,
118
+ storage_namespace: str,
119
+ username: Optional[str] = None,
120
+ ):
121
+ """This function is used to cleanup the storage. Before we having
122
+ a good design for storage backend, it can be used to delete the old
123
+ data. It support redis cluster and non cluster mode.
124
+
125
+ Args:
126
+ host: The host address of the Redis.
127
+ port: The port of the Redis.
128
+ username: The username of the Redis.
129
+ password: The password of the Redis.
130
+ use_ssl: Whether to encrypt the connection.
131
+ storage_namespace: The namespace of the storage to be deleted.
132
+ """
133
+
134
+ from ray._raylet import del_key_prefix_from_storage # type: ignore
135
+
136
+ if not isinstance(host, str):
137
+ raise ValueError("Host must be a string")
138
+
139
+ if username is None:
140
+ username = ""
141
+
142
+ if not isinstance(username, str):
143
+ raise ValueError("Username must be a string")
144
+
145
+ if not isinstance(password, str):
146
+ raise ValueError("Password must be a string")
147
+
148
+ if port < 0:
149
+ raise ValueError(f"Invalid port: {port}")
150
+
151
+ if not isinstance(use_ssl, bool):
152
+ raise TypeError("use_ssl must be a boolean")
153
+
154
+ if not isinstance(storage_namespace, str):
155
+ raise ValueError("storage namespace must be a string")
156
+
157
+ # Right now, GCS stores all data into multiple hashes with keys prefixed by
158
+ # storage_namespace. So we only need to delete the specific key prefix to cleanup
159
+ # the cluster.
160
+ # Note this deletes all keys with prefix `RAY{key_prefix}@`, not `{key_prefix}`.
161
+ return del_key_prefix_from_storage(
162
+ host, port, username, password, use_ssl, storage_namespace
163
+ )
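
A hedged invocation sketch for `cleanup_redis_storage` above; the host and namespace values are placeholders. Note that, per the comment above, it deletes keys prefixed with `RAY{storage_namespace}@`, not the bare namespace:

    from ray._private.gcs_utils import cleanup_redis_storage

    cleanup_redis_storage(
        host="redis.example.internal",  # placeholder Redis host
        port=6379,
        password="",
        use_ssl=False,
        storage_namespace="my-ray-cluster",
    )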
.venv/lib/python3.11/site-packages/ray/_private/inspect_util.py ADDED
@@ -0,0 +1,49 @@
1
+ import inspect
2
+
3
+
4
+ def is_cython(obj):
5
+ """Check if an object is a Cython function or method"""
6
+
7
+ # TODO(suo): We could split these into two functions, one for Cython
8
+ # functions and another for Cython methods.
9
+ # TODO(suo): There doesn't appear to be a Cython function 'type' we can
10
+ # check against via isinstance. Please correct me if I'm wrong.
11
+ def check_cython(x):
12
+ return type(x).__name__ == "cython_function_or_method"
13
+
14
+ # Check if function or method, respectively
15
+ return check_cython(obj) or (
16
+ hasattr(obj, "__func__") and check_cython(obj.__func__)
17
+ )
18
+
19
+
20
+ def is_function_or_method(obj):
21
+ """Check if an object is a function or method.
22
+
23
+ Args:
24
+ obj: The Python object in question.
25
+
26
+ Returns:
27
+ True if the object is an function or method.
28
+ """
29
+ return inspect.isfunction(obj) or inspect.ismethod(obj) or is_cython(obj)
30
+
31
+
32
+ def is_class_method(f):
33
+ """Returns whether the given method is a class_method."""
34
+ return hasattr(f, "__self__") and f.__self__ is not None
35
+
36
+
37
+ def is_static_method(cls, f_name):
38
+ """Returns whether the class has a static method with the given name.
39
+
40
+ Args:
41
+ cls: The Python class (i.e. object of type `type`) to
42
+ search for the method in.
43
+ f_name: The name of the method to look up in this class
44
+ and check whether or not it is static.
45
+ """
46
+ for base_cls in inspect.getmro(cls):
47
+ if f_name in base_cls.__dict__:
48
+ return isinstance(base_cls.__dict__[f_name], staticmethod)
49
+ return False
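
A quick illustration of the three helpers above against a toy class (the class is not part of Ray):

    from ray._private.inspect_util import (
        is_class_method,
        is_function_or_method,
        is_static_method,
    )

    class Example:  # toy class for illustration
        def plain(self):
            pass

        @classmethod
        def cls_method(cls):
            pass

        @staticmethod
        def static_method():
            pass

    assert is_function_or_method(Example.plain)
    assert is_class_method(Example.cls_method)       # __self__ is the class
    assert not is_class_method(Example.plain)
    assert is_static_method(Example, "static_method")
    assert not is_static_method(Example, "plain")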
.venv/lib/python3.11/site-packages/ray/_private/internal_api.py ADDED
@@ -0,0 +1,255 @@
1
+ from typing import List, Tuple
2
+
3
+ import ray
4
+ import ray._private.profiling as profiling
5
+ import ray._private.services as services
6
+ import ray._private.utils as utils
7
+ import ray._private.worker
8
+ from ray._private.state import GlobalState
9
+ from ray._raylet import GcsClientOptions
10
+ from ray.core.generated import common_pb2
11
+
12
+ __all__ = ["free", "global_gc"]
13
+ MAX_MESSAGE_LENGTH = ray._config.max_grpc_message_size()
14
+
15
+
16
+ def global_gc():
17
+ """Trigger gc.collect() on all workers in the cluster."""
18
+
19
+ worker = ray._private.worker.global_worker
20
+ worker.core_worker.global_gc()
21
+
22
+
23
+ def get_state_from_address(address=None):
24
+ address = services.canonicalize_bootstrap_address_or_die(address)
25
+
26
+ state = GlobalState()
27
+ options = GcsClientOptions.create(
28
+ address, None, allow_cluster_id_nil=True, fetch_cluster_id_if_nil=False
29
+ )
30
+ state._initialize_global_state(options)
31
+ return state
32
+
33
+
34
+ def memory_summary(
35
+ address=None,
36
+ group_by="NODE_ADDRESS",
37
+ sort_by="OBJECT_SIZE",
38
+ units="B",
39
+ line_wrap=True,
40
+ stats_only=False,
41
+ num_entries=None,
42
+ ):
43
+ from ray.dashboard.memory_utils import memory_summary
44
+
45
+ state = get_state_from_address(address)
46
+ reply = get_memory_info_reply(state)
47
+
48
+ if stats_only:
49
+ return store_stats_summary(reply)
50
+ return memory_summary(
51
+ state, group_by, sort_by, line_wrap, units, num_entries
52
+ ) + store_stats_summary(reply)
53
+
54
+
55
+ def get_memory_info_reply(state, node_manager_address=None, node_manager_port=None):
56
+ """Returns global memory info."""
57
+
58
+ from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc
59
+
60
+ # We can ask any Raylet for the global memory info; that Raylet internally
+ # asks all nodes in the cluster for memory stats.
62
+ if node_manager_address is None or node_manager_port is None:
63
+ # We should ask for a raylet that is alive.
64
+ raylet = None
65
+ for node in state.node_table():
66
+ if node["Alive"]:
67
+ raylet = node
68
+ break
69
+ assert raylet is not None, "Every raylet is dead"
70
+ raylet_address = "{}:{}".format(
71
+ raylet["NodeManagerAddress"], raylet["NodeManagerPort"]
72
+ )
73
+ else:
74
+ raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
75
+
76
+ channel = utils.init_grpc_channel(
77
+ raylet_address,
78
+ options=[
79
+ ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
80
+ ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
81
+ ],
82
+ )
83
+
84
+ stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
85
+ reply = stub.FormatGlobalMemoryInfo(
86
+ node_manager_pb2.FormatGlobalMemoryInfoRequest(include_memory_info=False),
87
+ timeout=60.0,
88
+ )
89
+ return reply
90
+
91
+
92
+ def node_stats(
93
+ node_manager_address=None, node_manager_port=None, include_memory_info=True
94
+ ):
95
+ """Returns NodeStats object describing memory usage in the cluster."""
96
+
97
+ from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc
98
+
99
+ # We can ask any Raylet for the global memory info.
100
+ assert node_manager_address is not None and node_manager_port is not None
101
+ raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
102
+ channel = utils.init_grpc_channel(
103
+ raylet_address,
104
+ options=[
105
+ ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
106
+ ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
107
+ ],
108
+ )
109
+
110
+ stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
111
+ node_stats = stub.GetNodeStats(
112
+ node_manager_pb2.GetNodeStatsRequest(include_memory_info=include_memory_info),
113
+ timeout=30.0,
114
+ )
115
+ return node_stats
116
+
117
+
118
+ def store_stats_summary(reply):
119
+ """Returns formatted string describing object store stats in all nodes."""
120
+ store_summary = "--- Aggregate object store stats across all nodes ---\n"
121
+ # TODO(ekl) it would be nice if we could provide a full memory usage
122
+ # breakdown by type (e.g., pinned by worker, primary, etc.)
123
+ store_summary += (
124
+ "Plasma memory usage {} MiB, {} objects, {}% full, {}% "
125
+ "needed\n".format(
126
+ int(reply.store_stats.object_store_bytes_used / (1024 * 1024)),
127
+ reply.store_stats.num_local_objects,
128
+ round(
129
+ 100
130
+ * reply.store_stats.object_store_bytes_used
131
+ / reply.store_stats.object_store_bytes_avail,
132
+ 2,
133
+ ),
134
+ round(
135
+ 100
136
+ * reply.store_stats.object_store_bytes_primary_copy
137
+ / reply.store_stats.object_store_bytes_avail,
138
+ 2,
139
+ ),
140
+ )
141
+ )
142
+ if reply.store_stats.object_store_bytes_fallback > 0:
143
+ store_summary += "Plasma filesystem mmap usage: {} MiB\n".format(
144
+ int(reply.store_stats.object_store_bytes_fallback / (1024 * 1024))
145
+ )
146
+ if reply.store_stats.spill_time_total_s > 0:
147
+ store_summary += (
148
+ "Spilled {} MiB, {} objects, avg write throughput {} MiB/s\n".format(
149
+ int(reply.store_stats.spilled_bytes_total / (1024 * 1024)),
150
+ reply.store_stats.spilled_objects_total,
151
+ int(
152
+ reply.store_stats.spilled_bytes_total
153
+ / (1024 * 1024)
154
+ / reply.store_stats.spill_time_total_s
155
+ ),
156
+ )
157
+ )
158
+ if reply.store_stats.restore_time_total_s > 0:
159
+ store_summary += (
160
+ "Restored {} MiB, {} objects, avg read throughput {} MiB/s\n".format(
161
+ int(reply.store_stats.restored_bytes_total / (1024 * 1024)),
162
+ reply.store_stats.restored_objects_total,
163
+ int(
164
+ reply.store_stats.restored_bytes_total
165
+ / (1024 * 1024)
166
+ / reply.store_stats.restore_time_total_s
167
+ ),
168
+ )
169
+ )
170
+ if reply.store_stats.consumed_bytes > 0:
171
+ store_summary += "Objects consumed by Ray tasks: {} MiB.\n".format(
172
+ int(reply.store_stats.consumed_bytes / (1024 * 1024))
173
+ )
174
+ if reply.store_stats.object_pulls_queued:
175
+ store_summary += "Object fetches queued, waiting for available memory."
176
+
177
+ return store_summary
178
+
179
+
180
+ def free(object_refs: list, local_only: bool = False):
181
+ """Free a list of IDs from the in-process and plasma object stores.
182
+
183
+ This function is a low-level API which should be used in restricted
184
+ scenarios.
185
+
186
+ If local_only is false, the request will be sent to all object stores.
187
+
188
+ This method will not return any value to indicate whether the deletion is
189
+ successful or not. This function is an instruction to the object store. If
190
+ some of the objects are in use, the object stores will delete them later
191
+ when the ref count is down to 0.
192
+
193
+ Examples:
194
+
195
+ .. testcode::
196
+
197
+ import ray
198
+
199
+ @ray.remote
200
+ def f():
201
+ return 0
202
+
203
+ obj_ref = f.remote()
204
+ ray.get(obj_ref) # wait for object to be created first
205
+ free([obj_ref]) # unpin & delete object globally
206
+
207
+ Args:
208
+ object_refs (List[ObjectRef]): List of object refs to delete.
209
+ local_only: Whether only deleting the list of objects in local
210
+ object store or all object stores.
211
+ """
212
+ worker = ray._private.worker.global_worker
213
+
214
+ if isinstance(object_refs, ray.ObjectRef):
215
+ object_refs = [object_refs]
216
+
217
+ if not isinstance(object_refs, list):
218
+ raise TypeError(
219
+ "free() expects a list of ObjectRef, got {}".format(type(object_refs))
220
+ )
221
+
222
+ # Make sure that the values are object refs.
223
+ for object_ref in object_refs:
224
+ if not isinstance(object_ref, ray.ObjectRef):
225
+ raise TypeError(
226
+ "Attempting to call `free` on the value {}, "
227
+ "which is not an ray.ObjectRef.".format(object_ref)
228
+ )
229
+
230
+ worker.check_connected()
231
+ with profiling.profile("ray.free"):
232
+ if len(object_refs) == 0:
233
+ return
234
+
235
+ worker.core_worker.free_objects(object_refs, local_only)
236
+
237
+
238
+ def get_local_ongoing_lineage_reconstruction_tasks() -> List[
239
+ Tuple[common_pb2.LineageReconstructionTask, int]
240
+ ]:
241
+ """Return the locally submitted ongoing retry tasks
242
+ triggered by lineage reconstruction.
243
+
244
+ NOTE: for the lineage reconstruction task status,
245
+ this method only returns the status known to the submitter
246
+ (i.e. it returns SUBMITTED_TO_WORKER instead of RUNNING).
247
+
248
+ The return type is a list of pairs where pair.first is the
249
+ lineage reconstruction task info and pair.second is the number
250
+ of ongoing lineage reconstruction tasks of this type.
251
+ """
252
+
253
+ worker = ray._private.worker.global_worker
254
+ worker.check_connected()
255
+ return worker.core_worker.get_local_ongoing_lineage_reconstruction_tasks()
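
A hedged sketch of how `memory_summary` above might be called against a running cluster; `address="auto"` assumes a cluster has already been started on this node:

    import ray
    from ray._private.internal_api import memory_summary

    ray.init(address="auto")  # assumes an existing local cluster
    # stats_only=True returns just the aggregate object store stats block.
    print(memory_summary(stats_only=True))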
.venv/lib/python3.11/site-packages/ray/_private/log.py ADDED
@@ -0,0 +1,117 @@
1
+ import logging
2
+ import threading
3
+ from typing import Union
4
+ import time
5
+
6
+ INTERNAL_TIMESTAMP_LOG_KEY = "_ray_timestamp_ns"
7
+
8
+
9
+ def _print_loggers():
10
+ """Print a formatted list of loggers and their handlers for debugging."""
11
+ loggers = {logging.root.name: logging.root}
12
+ loggers.update(dict(sorted(logging.root.manager.loggerDict.items())))
13
+ for name, logger in loggers.items():
14
+ if isinstance(logger, logging.Logger):
15
+ print(f" {name}: disabled={logger.disabled}, propagate={logger.propagate}")
16
+ for handler in logger.handlers:
17
+ print(f" {handler}")
18
+
19
+
20
+ def clear_logger(logger: Union[str, logging.Logger]):
21
+ """Reset a logger, clearing its handlers and enabling propagation.
22
+
23
+ Args:
24
+ logger: Logger to be cleared
25
+ """
26
+ if isinstance(logger, str):
27
+ logger = logging.getLogger(logger)
28
+ logger.propagate = True
29
+ logger.handlers.clear()
30
+
31
+
32
+ class PlainRayHandler(logging.StreamHandler):
33
+ """A plain log handler.
34
+
35
+ This handler writes to whatever sys.stderr points to at emit-time,
36
+ not at instantiation time. See docs for logging._StderrHandler.
37
+ """
38
+
39
+ def __init__(self):
40
+ super().__init__()
41
+ self.plain_handler = logging._StderrHandler()
42
+ self.plain_handler.level = self.level
43
+ self.plain_handler.formatter = logging.Formatter(fmt="%(message)s")
44
+
45
+ def emit(self, record: logging.LogRecord):
46
+ """Emit the log message.
47
+
48
+ If this is a worker, bypass fancy logging and just emit the log record.
49
+ If this is the driver, emit the message using the appropriate console handler.
50
+
51
+ Args:
52
+ record: Log record to be emitted
53
+ """
54
+ import ray
55
+
56
+ if (
57
+ hasattr(ray, "_private")
58
+ and hasattr(ray._private, "worker")
59
+ and ray._private.worker.global_worker.mode
60
+ == ray._private.worker.WORKER_MODE
61
+ ):
62
+ self.plain_handler.emit(record)
63
+ else:
64
+ logging._StderrHandler.emit(self, record)
65
+
66
+
67
+ logger_initialized = False
68
+ logging_config_lock = threading.Lock()
69
+
70
+
71
+ def _setup_log_record_factory():
72
+ """Setup log record factory to add _ray_timestamp_ns to LogRecord."""
73
+ old_factory = logging.getLogRecordFactory()
74
+
75
+ def record_factory(*args, **kwargs):
76
+ record = old_factory(*args, **kwargs)
77
+ # Starting with Python 3.13, the logging module uses `time.time_ns()` to
+ # generate `created`, avoiding the precision loss of the float type.
+ # Here we compute `created` for the LogRecord ourselves to support older
+ # Python versions as well.
81
+ ct = time.time_ns()
82
+ record.created = ct / 1e9
83
+
84
+ record.__dict__[INTERNAL_TIMESTAMP_LOG_KEY] = ct
85
+
86
+ return record
87
+
88
+ logging.setLogRecordFactory(record_factory)
89
+
90
+
91
+ def generate_logging_config():
92
+ """Generate the default Ray logging configuration."""
93
+ with logging_config_lock:
94
+ global logger_initialized
95
+ if logger_initialized:
96
+ return
97
+ logger_initialized = True
98
+
99
+ plain_formatter = logging.Formatter(
100
+ "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
101
+ )
102
+
103
+ default_handler = PlainRayHandler()
104
+ default_handler.setFormatter(plain_formatter)
105
+
106
+ ray_logger = logging.getLogger("ray")
107
+ ray_logger.setLevel(logging.INFO)
108
+ ray_logger.addHandler(default_handler)
109
+ ray_logger.propagate = False
110
+
111
+ # Special handling for ray.rllib: only warning-level messages passed through
112
+ # See https://github.com/ray-project/ray/pull/31858 for related PR
113
+ rllib_logger = logging.getLogger("ray.rllib")
114
+ rllib_logger.setLevel(logging.WARN)
115
+
116
+ # Set up the LogRecord factory.
117
+ _setup_log_record_factory()
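
A small check of the record-factory behavior set up above: after `generate_logging_config()`, every `LogRecord` carries a nanosecond timestamp under `_ray_timestamp_ns`:

    import logging

    from ray._private.log import INTERNAL_TIMESTAMP_LOG_KEY, generate_logging_config

    generate_logging_config()
    record = logging.getLogger("ray").makeRecord(
        "ray", logging.INFO, __file__, 0, "hello", (), None
    )
    print(getattr(record, INTERNAL_TIMESTAMP_LOG_KEY))  # nanosecond timestamp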
.venv/lib/python3.11/site-packages/ray/_private/log_monitor.py ADDED
@@ -0,0 +1,581 @@
1
+ import argparse
2
+ import errno
3
+ import glob
4
+ import logging
5
+ import logging.handlers
6
+ import os
7
+ import platform
8
+ import re
9
+ import shutil
10
+ import time
11
+ import traceback
12
+ from typing import Callable, List, Optional, Set
13
+
14
+ from ray._raylet import GcsClient
15
+ import ray._private.ray_constants as ray_constants
16
+ import ray._private.services as services
17
+ import ray._private.utils
18
+ from ray._private.ray_logging import setup_component_logger
19
+
20
+ # Logger for this module. It should be configured at the entry point
21
+ # into the program using Ray. Ray provides a default configuration at
22
+ # entry/init points.
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # The groups are job id, and pid.
26
+ WORKER_LOG_PATTERN = re.compile(".*worker.*-([0-9a-f]+)-(\d+)")
27
+ # The groups are job id.
28
+ RUNTIME_ENV_SETUP_PATTERN = re.compile(".*runtime_env_setup-(\d+).log")
29
+ # Log name update interval under pressure.
+ # We need it because updating log names is CPU intensive and can use
+ # 100% of a CPU when there are many log files.
32
+ LOG_NAME_UPDATE_INTERVAL_S = float(os.getenv("LOG_NAME_UPDATE_INTERVAL_S", 0.5))
33
+ # Once there are more files than this threshold, the log monitor starts
+ # applying backpressure to lower CPU usage.
35
+ RAY_LOG_MONITOR_MANY_FILES_THRESHOLD = int(
36
+ os.getenv("RAY_LOG_MONITOR_MANY_FILES_THRESHOLD", 1000)
37
+ )
38
+ RAY_RUNTIME_ENV_LOG_TO_DRIVER_ENABLED = int(
39
+ os.getenv("RAY_RUNTIME_ENV_LOG_TO_DRIVER_ENABLED", 0)
40
+ )
41
+
42
+
43
+ class LogFileInfo:
44
+ def __init__(
45
+ self,
46
+ filename=None,
47
+ size_when_last_opened=None,
48
+ file_position=None,
49
+ file_handle=None,
50
+ is_err_file=False,
51
+ job_id=None,
52
+ worker_pid=None,
53
+ ):
54
+ assert (
55
+ filename is not None
56
+ and size_when_last_opened is not None
57
+ and file_position is not None
58
+ )
59
+ self.filename = filename
60
+ self.size_when_last_opened = size_when_last_opened
61
+ self.file_position = file_position
62
+ self.file_handle = file_handle
63
+ self.is_err_file = is_err_file
64
+ self.job_id = job_id
65
+ self.worker_pid = worker_pid
66
+ self.actor_name = None
67
+ self.task_name = None
68
+
69
+ def reopen_if_necessary(self):
70
+ """Check if the file's inode has changed and reopen it if necessary.
71
+ There are a variety of reasons why what we would logically consider a
+ single file can end up with different inodes, such as log rotation or
+ file syncing semantics.
74
+ """
75
+ try:
76
+ open_inode = None
77
+ if self.file_handle and not self.file_handle.closed:
78
+ open_inode = os.fstat(self.file_handle.fileno()).st_ino
79
+
80
+ new_inode = os.stat(self.filename).st_ino
81
+ if open_inode != new_inode:
82
+ self.file_handle = open(self.filename, "rb")
83
+ self.file_handle.seek(self.file_position)
84
+ except Exception:
85
+ logger.debug(f"file no longer exists, skip re-opening of {self.filename}")
86
+
87
+ def __repr__(self):
88
+ return (
89
+ "FileInfo(\n"
90
+ f"\tfilename: {self.filename}\n"
91
+ f"\tsize_when_last_opened: {self.size_when_last_opened}\n"
92
+ f"\tfile_position: {self.file_position}\n"
93
+ f"\tfile_handle: {self.file_handle}\n"
94
+ f"\tis_err_file: {self.is_err_file}\n"
95
+ f"\tjob_id: {self.job_id}\n"
96
+ f"\tworker_pid: {self.worker_pid}\n"
97
+ f"\tactor_name: {self.actor_name}\n"
98
+ f"\ttask_name: {self.task_name}\n"
99
+ ")"
100
+ )
101
+
102
+
103
+ class LogMonitor:
104
+ """A monitor process for monitoring Ray log files.
105
+
106
+ This class maintains a list of open files and a list of closed log files. We
107
+ can't simply leave all files open because we'll run out of file
108
+ descriptors.
109
+
110
+ The "run" method of this class will cycle between doing several things:
111
+ 1. First, it will check if any new files have appeared in the log
112
+ directory. If so, they will be added to the list of closed files.
113
+ 2. Then, if we are unable to open any new files, we will close all of the
114
+ files.
115
+ 3. Then, we will open as many closed files as we can that may have new
116
+ lines (judged by an increase in file size since the last time the file
117
+ was opened).
118
+ 4. Then we will loop through the open files and see if there are any new
119
+ lines in the file. If so, we will publish them to Ray pubsub.
120
+
121
+ Attributes:
122
+ ip: The hostname of this machine, for grouping log messages.
123
+ logs_dir: The directory that the log files are in.
124
+ log_filenames: This is the set of filenames of all files in
125
+ open_file_infos and closed_file_infos.
126
+ open_file_infos (list[LogFileInfo]): Info for all of the open files.
127
+ closed_file_infos (list[LogFileInfo]): Info for all of the closed
128
+ files.
129
+ can_open_more_files: True if we can still open more files and
130
+ false otherwise.
131
+ max_files_open: The maximum number of files that can be open.
132
+ """
133
+
134
+ def __init__(
135
+ self,
136
+ node_ip_address: str,
137
+ logs_dir: str,
138
+ gcs_publisher: ray._raylet.GcsPublisher,
139
+ is_proc_alive_fn: Callable[[int], bool],
140
+ max_files_open: int = ray_constants.LOG_MONITOR_MAX_OPEN_FILES,
141
+ gcs_address: Optional[str] = None,
142
+ ):
143
+ """Initialize the log monitor object."""
144
+ self.ip: str = node_ip_address
145
+ self.logs_dir: str = logs_dir
146
+ self.publisher = gcs_publisher
147
+ self.log_filenames: Set[str] = set()
148
+ self.open_file_infos: List[LogFileInfo] = []
149
+ self.closed_file_infos: List[LogFileInfo] = []
150
+ self.can_open_more_files: bool = True
151
+ self.max_files_open: int = max_files_open
152
+ self.is_proc_alive_fn: Callable[[int], bool] = is_proc_alive_fn
153
+ self.is_autoscaler_v2: bool = self.get_is_autoscaler_v2(gcs_address)
154
+
155
+ logger.info(
156
+ f"Starting log monitor with [max open files={max_files_open}],"
157
+ f" [is_autoscaler_v2={self.is_autoscaler_v2}]"
158
+ )
159
+
160
+ def get_is_autoscaler_v2(self, gcs_address: Optional[str]) -> bool:
161
+ """Check if autoscaler v2 is enabled."""
162
+ if gcs_address is None:
163
+ return False
164
+
165
+ if not ray.experimental.internal_kv._internal_kv_initialized():
166
+ gcs_client = GcsClient(address=gcs_address)
167
+ ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
168
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
169
+
170
+ return is_autoscaler_v2()
171
+
172
+ def _close_all_files(self):
173
+ """Close all open files (so that we can open more)."""
174
+ while len(self.open_file_infos) > 0:
175
+ file_info = self.open_file_infos.pop(0)
176
+ file_info.file_handle.close()
177
+ file_info.file_handle = None
178
+
179
+ proc_alive = True
180
+ # Test if the worker process that generated the log file
181
+ # is still alive. Only applies to worker processes.
182
+ # For all other system components, we always assume they are alive.
183
+ if (
184
+ file_info.worker_pid != "raylet"
185
+ and file_info.worker_pid != "gcs_server"
186
+ and file_info.worker_pid != "autoscaler"
187
+ and file_info.worker_pid != "runtime_env"
188
+ and file_info.worker_pid is not None
189
+ ):
190
+ assert not isinstance(file_info.worker_pid, str), (
191
+ "PID should be an int type. " f"Given PID: {file_info.worker_pid}."
192
+ )
193
+ proc_alive = self.is_proc_alive_fn(file_info.worker_pid)
194
+ if not proc_alive:
195
+ # The process is not alive any more, so move the log file
196
+ # out of the log directory so glob.glob will not be slowed
197
+ # by it.
198
+ target = os.path.join(
199
+ self.logs_dir, "old", os.path.basename(file_info.filename)
200
+ )
201
+ try:
202
+ shutil.move(file_info.filename, target)
203
+ except (IOError, OSError) as e:
204
+ if e.errno == errno.ENOENT:
205
+ logger.warning(
206
+ f"Warning: The file {file_info.filename} was not found."
207
+ )
208
+ else:
209
+ raise e
210
+
211
+ if proc_alive:
212
+ self.closed_file_infos.append(file_info)
213
+
214
+ self.can_open_more_files = True
215
+
216
+ def update_log_filenames(self):
217
+ """Update the list of log files to monitor."""
218
+ monitor_log_paths = []
219
+ # output of user code is written here
220
+ monitor_log_paths += glob.glob(
221
+ f"{self.logs_dir}/worker*[.out|.err]"
222
+ ) + glob.glob(f"{self.logs_dir}/java-worker*.log")
223
+ # segfaults and other serious errors are logged here
224
+ monitor_log_paths += glob.glob(f"{self.logs_dir}/raylet*.err")
225
+ # monitor logs are needed to report autoscaler events
226
+ # TODO(rickyx): remove this after migration.
227
+ if not self.is_autoscaler_v2:
228
+ # We publish monitor logs in autoscaler v1
229
+ monitor_log_paths += glob.glob(f"{self.logs_dir}/monitor.log")
230
+ else:
231
+ # We publish autoscaler events directly in autoscaler v2
232
+ monitor_log_paths += glob.glob(
233
+ f"{self.logs_dir}/events/event_AUTOSCALER.log"
234
+ )
235
+
236
+ # If gcs server restarts, there can be multiple log files.
237
+ monitor_log_paths += glob.glob(f"{self.logs_dir}/gcs_server*.err")
238
+
239
+ # runtime_env setup process is logged here
240
+ if RAY_RUNTIME_ENV_LOG_TO_DRIVER_ENABLED:
241
+ monitor_log_paths += glob.glob(f"{self.logs_dir}/runtime_env*.log")
242
+ for file_path in monitor_log_paths:
243
+ if os.path.isfile(file_path) and file_path not in self.log_filenames:
244
+ worker_match = WORKER_LOG_PATTERN.match(file_path)
245
+ if worker_match:
246
+ worker_pid = int(worker_match.group(2))
247
+ else:
248
+ worker_pid = None
249
+ job_id = None
250
+
251
+ # Perform the substring check first because most files will not
+ # include runtime_env. This saves some CPU cycles.
253
+ if "runtime_env" in file_path:
254
+ runtime_env_job_match = RUNTIME_ENV_SETUP_PATTERN.match(file_path)
255
+ if runtime_env_job_match:
256
+ job_id = runtime_env_job_match.group(1)
257
+
258
+ is_err_file = file_path.endswith("err")
259
+
260
+ self.log_filenames.add(file_path)
261
+ self.closed_file_infos.append(
262
+ LogFileInfo(
263
+ filename=file_path,
264
+ size_when_last_opened=0,
265
+ file_position=0,
266
+ file_handle=None,
267
+ is_err_file=is_err_file,
268
+ job_id=job_id,
269
+ worker_pid=worker_pid,
270
+ )
271
+ )
272
+ log_filename = os.path.basename(file_path)
273
+ logger.info(f"Beginning to track file {log_filename}")
274
+
275
+ def open_closed_files(self):
276
+ """Open some closed files if they may have new lines.
277
+
278
+ Opening more files may require us to close some of the already open
279
+ files.
280
+ """
281
+ if not self.can_open_more_files:
282
+ # If we can't open any more files. Close all of the files.
283
+ self._close_all_files()
284
+
285
+ files_with_no_updates = []
286
+ while len(self.closed_file_infos) > 0:
287
+ if len(self.open_file_infos) >= self.max_files_open:
288
+ self.can_open_more_files = False
289
+ break
290
+
291
+ file_info = self.closed_file_infos.pop(0)
292
+ assert file_info.file_handle is None
293
+ # Get the file size to see if it has gotten bigger since we last
294
+ # opened it.
295
+ try:
296
+ file_size = os.path.getsize(file_info.filename)
297
+ except (IOError, OSError) as e:
298
+ # Catch "file not found" errors.
299
+ if e.errno == errno.ENOENT:
300
+ logger.warning(
301
+ f"Warning: The file {file_info.filename} was not found."
302
+ )
303
+ self.log_filenames.remove(file_info.filename)
304
+ continue
305
+ raise e
306
+
307
+ # If some new lines have been added to this file, try to reopen the
308
+ # file.
309
+ if file_size > file_info.size_when_last_opened:
310
+ try:
311
+ f = open(file_info.filename, "rb")
312
+ except (IOError, OSError) as e:
313
+ if e.errno == errno.ENOENT:
314
+ logger.warning(
315
+ f"Warning: The file {file_info.filename} was not found."
316
+ )
317
+ self.log_filenames.remove(file_info.filename)
318
+ continue
319
+ else:
320
+ raise e
321
+
322
+ f.seek(file_info.file_position)
323
+ file_info.size_when_last_opened = file_size
324
+ file_info.file_handle = f
325
+ self.open_file_infos.append(file_info)
326
+ else:
327
+ files_with_no_updates.append(file_info)
328
+
329
+ if len(self.open_file_infos) >= self.max_files_open:
330
+ self.can_open_more_files = False
331
+ # Add the files with no changes back to the list of closed files.
332
+ self.closed_file_infos += files_with_no_updates
333
+
334
+ def check_log_files_and_publish_updates(self):
335
+ """Gets updates to the log files and publishes them.
336
+
337
+ Returns:
338
+ True if anything was published and false otherwise.
339
+ """
340
+ anything_published = False
341
+ lines_to_publish = []
342
+
343
+ def flush():
344
+ nonlocal lines_to_publish
345
+ nonlocal anything_published
346
+ if len(lines_to_publish) > 0:
347
+ data = {
348
+ "ip": self.ip,
349
+ "pid": file_info.worker_pid,
350
+ "job": file_info.job_id,
351
+ "is_err": file_info.is_err_file,
352
+ "lines": lines_to_publish,
353
+ "actor_name": file_info.actor_name,
354
+ "task_name": file_info.task_name,
355
+ }
356
+ try:
357
+ self.publisher.publish_logs(data)
358
+ except Exception:
359
+ logger.exception(f"Failed to publish log messages {data}")
360
+ anything_published = True
361
+ lines_to_publish = []
362
+
363
+ for file_info in self.open_file_infos:
364
+ assert not file_info.file_handle.closed
365
+ file_info.reopen_if_necessary()
366
+
367
+ max_num_lines_to_read = ray_constants.LOG_MONITOR_NUM_LINES_TO_READ
368
+ for _ in range(max_num_lines_to_read):
369
+ try:
370
+ next_line = file_info.file_handle.readline()
371
+ # Replace any characters not in UTF-8 with
372
+ # a replacement character, see
373
+ # https://stackoverflow.com/a/38565489/10891801
374
+ next_line = next_line.decode("utf-8", "replace")
375
+ if next_line == "":
376
+ break
377
+ next_line = next_line.rstrip("\r\n")
378
+
379
+ if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
380
+ flush() # Possible change of task/actor name.
381
+ file_info.actor_name = next_line.split(
382
+ ray_constants.LOG_PREFIX_ACTOR_NAME, 1
383
+ )[1]
384
+ file_info.task_name = None
385
+ elif next_line.startswith(ray_constants.LOG_PREFIX_TASK_NAME):
386
+ flush() # Possible change of task/actor name.
387
+ file_info.task_name = next_line.split(
388
+ ray_constants.LOG_PREFIX_TASK_NAME, 1
389
+ )[1]
390
+ elif next_line.startswith(ray_constants.LOG_PREFIX_JOB_ID):
391
+ file_info.job_id = next_line.split(
392
+ ray_constants.LOG_PREFIX_JOB_ID, 1
393
+ )[1]
394
+ elif next_line.startswith(
395
+ "Windows fatal exception: access violation"
396
+ ):
397
+ # We are suppressing the
398
+ # 'Windows fatal exception: access violation'
399
+ # message on workers on Windows here.
400
+ # As far as we know it is harmless,
401
+ # but is frequently popping up if Python
402
+ # functions are run inside the core
403
+ # worker C extension. See the investigation in
404
+ # github.com/ray-project/ray/issues/18944
405
+ # Also skip the following line, which is an
406
+ # empty line.
407
+ file_info.file_handle.readline()
408
+ else:
409
+ lines_to_publish.append(next_line)
410
+ except Exception:
411
+ logger.error(
412
+ f"Error: Reading file: {file_info.filename}, "
413
+ f"position: {file_info.file_info.file_handle.tell()} "
414
+ "failed."
415
+ )
416
+ raise
417
+
418
+ if file_info.file_position == 0:
419
+ # make filename windows-agnostic
420
+ filename = file_info.filename.replace("\\", "/")
421
+ if "/raylet" in filename:
422
+ file_info.worker_pid = "raylet"
423
+ elif "/gcs_server" in filename:
424
+ file_info.worker_pid = "gcs_server"
425
+ elif "/monitor" in filename or "event_AUTOSCALER" in filename:
426
+ file_info.worker_pid = "autoscaler"
427
+ elif "/runtime_env" in filename:
428
+ file_info.worker_pid = "runtime_env"
429
+
430
+ # Record the current position in the file.
431
+ file_info.file_position = file_info.file_handle.tell()
432
+ flush()
433
+
434
+ return anything_published
435
+
436
+ def should_update_filenames(self, last_file_updated_time: float) -> bool:
437
+ """Return true if filenames should be updated.
438
+
439
+ This method is used to apply the backpressure on file updates because
440
+ that requires heavy glob operations which use lots of CPUs.
441
+
442
+ Args:
443
+ last_file_updated_time: The last time filenames were updated.
444
+
445
+ Returns:
446
+ True if filenames should be updated. False otherwise.
447
+ """
448
+ elapsed_seconds = float(time.time() - last_file_updated_time)
449
+ return (
450
+ len(self.log_filenames) < RAY_LOG_MONITOR_MANY_FILES_THRESHOLD
451
+ or elapsed_seconds > LOG_NAME_UPDATE_INTERVAL_S
452
+ )
453
+
454
+ def run(self):
455
+ """Run the log monitor.
456
+
457
+ This will scan the file system once every LOG_NAME_UPDATE_INTERVAL_S to
458
+ check if there are new log files to monitor. It will also publish new
459
+ log lines.
460
+ """
461
+ last_updated = time.time()
462
+ while True:
463
+ if self.should_update_filenames(last_updated):
464
+ self.update_log_filenames()
465
+ last_updated = time.time()
466
+
467
+ self.open_closed_files()
468
+ anything_published = self.check_log_files_and_publish_updates()
469
+ # If nothing was published, then wait a little bit before checking
470
+ # for logs to avoid using too much CPU.
471
+ if not anything_published:
472
+ time.sleep(0.1)
473
+
474
+
475
+ def is_proc_alive(pid):
476
+ # Import locally to make sure the bundled version is used if needed
477
+ import psutil
478
+
479
+ try:
480
+ return psutil.Process(pid).is_running()
481
+ except psutil.NoSuchProcess:
482
+ # The process does not exist.
483
+ return False
484
+
485
+
486
+ if __name__ == "__main__":
487
+ parser = argparse.ArgumentParser(
488
+ description=("Parse GCS server address for the log monitor to connect to.")
489
+ )
490
+ parser.add_argument(
491
+ "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
492
+ )
493
+ parser.add_argument(
494
+ "--logging-level",
495
+ required=False,
496
+ type=str,
497
+ default=ray_constants.LOGGER_LEVEL,
498
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
499
+ help=ray_constants.LOGGER_LEVEL_HELP,
500
+ )
501
+ parser.add_argument(
502
+ "--logging-format",
503
+ required=False,
504
+ type=str,
505
+ default=ray_constants.LOGGER_FORMAT,
506
+ help=ray_constants.LOGGER_FORMAT_HELP,
507
+ )
508
+ parser.add_argument(
509
+ "--logging-filename",
510
+ required=False,
511
+ type=str,
512
+ default=ray_constants.LOG_MONITOR_LOG_FILE_NAME,
513
+ help="Specify the name of log file, "
514
+ "log to stdout if set empty, default is "
515
+ f'"{ray_constants.LOG_MONITOR_LOG_FILE_NAME}"',
516
+ )
517
+ parser.add_argument(
518
+ "--session-dir",
519
+ required=True,
520
+ type=str,
521
+ help="Specify the path of the session directory used by Ray processes.",
522
+ )
523
+ parser.add_argument(
524
+ "--logs-dir",
525
+ required=True,
526
+ type=str,
527
+ help="Specify the path of the log directory used by Ray processes.",
528
+ )
529
+ parser.add_argument(
530
+ "--logging-rotate-bytes",
531
+ required=False,
532
+ type=int,
533
+ default=ray_constants.LOGGING_ROTATE_BYTES,
534
+ help="Specify the max bytes for rotating "
535
+ "log file, default is "
536
+ f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
537
+ )
538
+ parser.add_argument(
539
+ "--logging-rotate-backup-count",
540
+ required=False,
541
+ type=int,
542
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
543
+ help="Specify the backup count of rotated log file, default is "
544
+ f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
545
+ )
546
+ args = parser.parse_args()
547
+ setup_component_logger(
548
+ logging_level=args.logging_level,
549
+ logging_format=args.logging_format,
550
+ log_dir=args.logs_dir,
551
+ filename=args.logging_filename,
552
+ max_bytes=args.logging_rotate_bytes,
553
+ backup_count=args.logging_rotate_backup_count,
554
+ )
555
+
556
+ node_ip = services.get_cached_node_ip_address(args.session_dir)
557
+ log_monitor = LogMonitor(
558
+ node_ip,
559
+ args.logs_dir,
560
+ ray._raylet.GcsPublisher(address=args.gcs_address),
561
+ is_proc_alive,
562
+ gcs_address=args.gcs_address,
563
+ )
564
+
565
+ try:
566
+ log_monitor.run()
567
+ except Exception as e:
568
+ # Something went wrong, so push an error to all drivers.
569
+ gcs_publisher = ray._raylet.GcsPublisher(address=args.gcs_address)
570
+ traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
571
+ message = (
572
+ f"The log monitor on node {platform.node()} "
573
+ f"failed with the following error:\n{traceback_str}"
574
+ )
575
+ ray._private.utils.publish_error_to_driver(
576
+ ray_constants.LOG_MONITOR_DIED_ERROR,
577
+ message,
578
+ gcs_publisher=gcs_publisher,
579
+ )
580
+ logger.error(message)
581
+ raise e
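
An illustration of `WORKER_LOG_PATTERN` above. The path is made up but follows the `worker-<worker id>-<job id>-<pid>.out` convention; as documented, group 1 is the job id and group 2 is the pid:

    from ray._private.log_monitor import WORKER_LOG_PATTERN

    path = "/tmp/ray/session_latest/logs/worker-0123abcd-01000000-4242.out"
    match = WORKER_LOG_PATTERN.match(path)
    job_id, worker_pid = match.group(1), int(match.group(2))
    print(job_id, worker_pid)  # 01000000 4242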
.venv/lib/python3.11/site-packages/ray/_private/logging_utils.py ADDED
@@ -0,0 +1,29 @@
1
+ from ray.core.generated.logging_pb2 import LogBatch
2
+
3
+
4
+ def log_batch_dict_to_proto(log_json: dict) -> LogBatch:
5
+ """Converts a dict containing a batch of logs to a LogBatch proto."""
6
+ return LogBatch(
7
+ ip=log_json.get("ip"),
8
+ # Cast to support string pid like "gcs".
9
+ pid=str(log_json.get("pid")) if log_json.get("pid") else None,
10
+ # Job ID as a hex string.
11
+ job_id=log_json.get("job"),
12
+ is_error=bool(log_json.get("is_err")),
13
+ lines=log_json.get("lines"),
14
+ actor_name=log_json.get("actor_name"),
15
+ task_name=log_json.get("task_name"),
16
+ )
17
+
18
+
19
+ def log_batch_proto_to_dict(log_batch: LogBatch) -> dict:
20
+ """Converts a LogBatch proto to a dict containing a batch of logs."""
21
+ return {
22
+ "ip": log_batch.ip,
23
+ "pid": log_batch.pid,
24
+ "job": log_batch.job_id,
25
+ "is_err": log_batch.is_error,
26
+ "lines": log_batch.lines,
27
+ "actor_name": log_batch.actor_name,
28
+ "task_name": log_batch.task_name,
29
+ }
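The two helpers above are intended to be inverses of each other. A minimal round-trip sketch, assuming Ray and its generated protobuf modules are importable; the field values are invented for illustration:

from ray._private.logging_utils import (
    log_batch_dict_to_proto,
    log_batch_proto_to_dict,
)

# A hypothetical batch of log lines, shaped like what the log monitor publishes.
batch = {
    "ip": "127.0.0.1",
    "pid": 12345,
    "job": "0100",
    "is_err": False,
    "lines": ["hello from a worker"],
    "actor_name": None,
    "task_name": None,
}

proto = log_batch_dict_to_proto(batch)
roundtrip = log_batch_proto_to_dict(proto)
# pid is cast to a string on the way in, so it comes back as "12345".
print(roundtrip["pid"], list(roundtrip["lines"]))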
.venv/lib/python3.11/site-packages/ray/_private/memory_monitor.py ADDED
@@ -0,0 +1,162 @@
1
+ import logging
2
+ import os
3
+ import platform
4
+ import sys
5
+ import time
6
+
7
+ # Importing ray before psutil ensures we use the psutil version bundled with Ray
8
+ import ray # noqa F401
9
+ import psutil # noqa E402
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def get_rss(memory_info):
15
+ """Get the estimated non-shared memory usage from psutil memory_info."""
16
+ mem = memory_info.rss
17
+ # OSX doesn't have the shared attribute
18
+ if hasattr(memory_info, "shared"):
19
+ mem -= memory_info.shared
20
+ return mem
21
+
22
+
23
+ def get_shared(virtual_memory):
24
+ """Get the estimated shared memory usage from psutil virtual mem info."""
25
+ # OSX doesn't have the shared attribute
26
+ if hasattr(virtual_memory, "shared"):
27
+ return virtual_memory.shared
28
+ else:
29
+ return 0
30
+
31
+
32
+ def get_top_n_memory_usage(n: int = 10):
33
+ """Get the top n processes by memory usage.
34
+
35
+ Params:
36
+ n: Number of top processes to include.
37
+ Returns:
38
+ (str) The formatted string of top n process memory usage.
39
+ """
40
+ pids = psutil.pids()
41
+ proc_stats = []
42
+ for pid in pids:
43
+ try:
44
+ proc = psutil.Process(pid)
45
+ proc_stats.append((get_rss(proc.memory_info()), pid, proc.cmdline()))
46
+ except psutil.NoSuchProcess:
47
+ # We should skip processes that have exited. Refer to this
48
+ # issue for more detail:
49
+ # https://github.com/ray-project/ray/issues/14929
50
+ continue
51
+ except psutil.AccessDenied:
52
+ # On MacOS, the proc_pidinfo call (used to get per-process
53
+ # memory info) fails with a permission denied error when used
54
+ # on a process that isn’t owned by the same user. For now, we
55
+ # drop the memory info of any such process, assuming that
56
+ # processes owned by other users (e.g. root) aren't Ray
57
+ # processes and will be of less interest when an OOM happens
58
+ # on a Ray node.
59
+ # See issue for more detail:
60
+ # https://github.com/ray-project/ray/issues/11845#issuecomment-849904019 # noqa: E501
61
+ continue
62
+ proc_str = "PID\tMEM\tCOMMAND"
63
+ for rss, pid, cmdline in sorted(proc_stats, reverse=True)[:n]:
64
+ proc_str += "\n{}\t{}GiB\t{}".format(
65
+ pid, round(rss / (1024**3), 2), " ".join(cmdline)[:100].strip()
66
+ )
67
+ return proc_str
68
+
69
+
70
+ class RayOutOfMemoryError(Exception):
71
+ def __init__(self, msg):
72
+ Exception.__init__(self, msg)
73
+
74
+ @staticmethod
75
+ def get_message(used_gb, total_gb, threshold):
76
+ proc_str = get_top_n_memory_usage(n=10)
77
+ return (
78
+ "More than {}% of the memory on ".format(int(100 * threshold))
79
+ + "node {} is used ({} / {} GB). ".format(
80
+ platform.node(), round(used_gb, 2), round(total_gb, 2)
81
+ )
82
+ + f"The top 10 memory consumers are:\n\n{proc_str}"
83
+ + "\n\nIn addition, up to {} GiB of shared memory is ".format(
84
+ round(get_shared(psutil.virtual_memory()) / (1024**3), 2)
85
+ )
86
+ + "currently being used by the Ray object store.\n---\n"
87
+ "--- Tip: Use the `ray memory` command to list active "
88
+ "objects in the cluster.\n"
89
+ "--- To disable OOM exceptions, set "
90
+ "RAY_DISABLE_MEMORY_MONITOR=1.\n---\n"
91
+ )
92
+
93
+
94
+ class MemoryMonitor:
95
+ """Helper class for raising errors on low memory.
96
+
97
+ This presents a much cleaner error message to users than what would happen
98
+ if we actually ran out of memory.
99
+
100
+ The monitor tries to use the cgroup memory limit and usage if it is set
101
+ and available so that it is more reasonable inside containers. Otherwise,
102
+ it uses `psutil` to check the memory usage.
103
+
104
+ The environment variable `RAY_MEMORY_MONITOR_ERROR_THRESHOLD` can be used
105
+ to overwrite the default error_threshold setting.
106
+
107
+ Used by test only. For production code use memory_monitor.cc
108
+ """
109
+
110
+ def __init__(self, error_threshold=0.95, check_interval=1):
111
+ # Note: it takes ~50us to check the memory usage through psutil, so
112
+ # throttle this check at most once a second or so.
113
+ self.check_interval = check_interval
114
+ self.last_checked = 0
115
+ try:
116
+ self.error_threshold = float(
117
+ os.getenv("RAY_MEMORY_MONITOR_ERROR_THRESHOLD")
118
+ )
119
+ except (ValueError, TypeError):
120
+ self.error_threshold = error_threshold
121
+ # Try to read the cgroup memory limit if it is available.
122
+ try:
123
+ with open("/sys/fs/cgroup/memory/memory.limit_in_bytes", "rb") as f:
124
+ self.cgroup_memory_limit_gb = int(f.read()) / (1024**3)
125
+ except IOError:
126
+ self.cgroup_memory_limit_gb = sys.maxsize / (1024**3)
127
+ if not psutil:
128
+ logger.warning(
129
+ "WARNING: Not monitoring node memory since `psutil` "
130
+ "is not installed. Install this with "
131
+ "`pip install psutil` to enable "
132
+ "debugging of memory-related crashes."
133
+ )
134
+ self.disabled = (
135
+ "RAY_DEBUG_DISABLE_MEMORY_MONITOR" in os.environ
136
+ or "RAY_DISABLE_MEMORY_MONITOR" in os.environ
137
+ )
138
+
139
+ def get_memory_usage(self):
140
+ from ray._private.utils import get_system_memory, get_used_memory
141
+
142
+ total_gb = get_system_memory() / (1024**3)
143
+ used_gb = get_used_memory() / (1024**3)
144
+
145
+ return used_gb, total_gb
146
+
147
+ def raise_if_low_memory(self):
148
+ if self.disabled:
149
+ return
150
+
151
+ if time.time() - self.last_checked > self.check_interval:
152
+ self.last_checked = time.time()
153
+ used_gb, total_gb = self.get_memory_usage()
154
+
155
+ if used_gb > total_gb * self.error_threshold:
156
+ raise RayOutOfMemoryError(
157
+ RayOutOfMemoryError.get_message(
158
+ used_gb, total_gb, self.error_threshold
159
+ )
160
+ )
161
+ else:
162
+ logger.debug(f"Memory usage is {used_gb} / {total_gb}")
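As the class docstring notes, MemoryMonitor is only used by tests; a minimal usage sketch under that assumption (the threshold below is arbitrary):

from ray._private.memory_monitor import MemoryMonitor, RayOutOfMemoryError

# Raise once used memory exceeds 90% of total; checks are throttled to at most
# one per `check_interval` seconds.
monitor = MemoryMonitor(error_threshold=0.9, check_interval=1)

try:
    monitor.raise_if_low_memory()
except RayOutOfMemoryError as e:
    # The message embeds the top-10 memory consumers on the node.
    print(e)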
.venv/lib/python3.11/site-packages/ray/_private/metrics_agent.py ADDED
@@ -0,0 +1,675 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import threading
6
+ import time
7
+ import traceback
8
+ from collections import namedtuple
9
+ from typing import List, Tuple, Any, Dict, Set
10
+
11
+ from prometheus_client.core import (
12
+ CounterMetricFamily,
13
+ GaugeMetricFamily,
14
+ HistogramMetricFamily,
15
+ )
16
+ from opencensus.metrics.export.value import ValueDouble
17
+ from opencensus.metrics.export.metric_descriptor import MetricDescriptorType
18
+ from opencensus.stats import aggregation
19
+ from opencensus.stats import measure as measure_module
20
+ from opencensus.stats.view_manager import ViewManager
21
+ from opencensus.stats.stats_recorder import StatsRecorder
22
+ from opencensus.stats.base_exporter import StatsExporter
23
+ from prometheus_client.core import Metric as PrometheusMetric
24
+ from opencensus.stats.aggregation_data import (
25
+ CountAggregationData,
26
+ DistributionAggregationData,
27
+ LastValueAggregationData,
28
+ SumAggregationData,
29
+ )
30
+ from opencensus.stats.view import View
31
+ from opencensus.tags import tag_key as tag_key_module
32
+ from opencensus.tags import tag_map as tag_map_module
33
+ from opencensus.tags import tag_value as tag_value_module
34
+
35
+ import ray
36
+ from ray._raylet import GcsClient
37
+
38
+ from ray.core.generated.metrics_pb2 import Metric
39
+ from ray._private.ray_constants import env_bool
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+ # Env var key to decide worker timeout.
44
+ # If the worker doesn't report for more than
45
+ # this time, we treat workers as dead.
46
+ RAY_WORKER_TIMEOUT_S = "RAY_WORKER_TIMEOUT_S"
47
+ GLOBAL_COMPONENT_KEY = "CORE"
48
+ RE_NON_ALPHANUMS = re.compile(r"[^a-zA-Z0-9]")
49
+
50
+
51
+ class Gauge(View):
52
+ """Gauge representation of opencensus view.
53
+
54
+ This class is used to collect process metrics from the reporter agent.
55
+ Cpp metrics should be collected in a different way.
56
+ """
57
+
58
+ def __init__(self, name, description, unit, tags: List[str]):
59
+ self._measure = measure_module.MeasureInt(name, description, unit)
60
+ tags = [tag_key_module.TagKey(tag) for tag in tags]
61
+ self._view = View(
62
+ name, description, tags, self.measure, aggregation.LastValueAggregation()
63
+ )
64
+
65
+ @property
66
+ def measure(self):
67
+ return self._measure
68
+
69
+ @property
70
+ def view(self):
71
+ return self._view
72
+
73
+ @property
74
+ def name(self):
75
+ return self.measure.name
76
+
77
+
78
+ Record = namedtuple("Record", ["gauge", "value", "tags"])
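`Gauge` wraps an OpenCensus measure/view pair, and `Record` is the unit consumed by `MetricsAgent.record_and_export` further below. A hedged construction sketch; the metric name, unit, and tag key are invented for illustration:

from ray._private.metrics_agent import Gauge, Record

gauge = Gauge(
    name="example_queue_size",
    description="Number of queued items (illustrative only).",
    unit="items",
    tags=["Component"],
)
record = Record(gauge=gauge, value=42.0, tags={"Component": "demo"})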
79
+
80
+
81
+ def fix_grpc_metric(metric: Metric):
82
+ """
83
+ Fix the inbound `opencensus.proto.metrics.v1.Metric` protos to make it acceptable
84
+ by opencensus.stats.DistributionAggregationData.
85
+
86
+ - metric name: gRPC OpenCensus metrics have names with slashes and dots, e.g.
87
+ `grpc.io/client/server_latency`[1]. However Prometheus metric names only take
88
+ alphanums, underscores and colons[2]. We sanitize the name by replacing non-alphanum
89
+ chars with underscores, like the official opencensus prometheus exporter[3].
90
+ - distribution bucket bounds: The Metric proto asks distribution bucket bounds to
91
+ be > 0 [4]. However, gRPC OpenCensus metrics have their first bucket bound == 0 [1].
92
+ This makes the `DistributionAggregationData` constructor to raise Exceptions. This
93
+ applies to all bytes and milliseconds (latencies). The fix: we update the initial 0
94
+ bounds to be 0.000_000_1. This will not affect the precision of the metrics, since
95
+ we don't expect any less-than-1 bytes, or less-than-1-nanosecond times.
96
+
97
+ [1] https://github.com/census-instrumentation/opencensus-specs/blob/master/stats/gRPC.md#units # noqa: E501
98
+ [2] https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels
99
+ [3] https://github.com/census-instrumentation/opencensus-cpp/blob/50eb5de762e5f87e206c011a4f930adb1a1775b1/opencensus/exporters/stats/prometheus/internal/prometheus_utils.cc#L39 # noqa: E501
100
+ [4] https://github.com/census-instrumentation/opencensus-proto/blob/master/src/opencensus/proto/metrics/v1/metrics.proto#L218 # noqa: E501
101
+ """
102
+
103
+ if not metric.metric_descriptor.name.startswith("grpc.io/"):
104
+ return
105
+
106
+ metric.metric_descriptor.name = RE_NON_ALPHANUMS.sub(
107
+ "_", metric.metric_descriptor.name
108
+ )
109
+
110
+ for series in metric.timeseries:
111
+ for point in series.points:
112
+ if point.HasField("distribution_value"):
113
+ dist_value = point.distribution_value
114
+ bucket_bounds = dist_value.bucket_options.explicit.bounds
115
+ if len(bucket_bounds) > 0 and bucket_bounds[0] == 0:
116
+ bucket_bounds[0] = 0.000_000_1
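The name sanitization described in the docstring is a plain regex substitution; a small standalone illustration using the same pattern:

import re

RE_NON_ALPHANUMS = re.compile(r"[^a-zA-Z0-9]")  # same pattern as defined above

# A standard gRPC OpenCensus metric name becomes Prometheus-safe:
print(RE_NON_ALPHANUMS.sub("_", "grpc.io/client/server_latency"))
# -> grpc_io_client_server_latency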
117
+
118
+
119
+ class OpencensusProxyMetric:
120
+ def __init__(self, name: str, desc: str, unit: str, label_keys: List[str]):
121
+ """Represents the OpenCensus metrics that will be proxy exported."""
122
+ self._name = name
123
+ self._desc = desc
124
+ self._unit = unit
125
+ # -- The label keys of the metric --
126
+ self._label_keys = label_keys
127
+ # -- The data that needs to be proxy exported --
128
+ # tuple of label values -> data (OpenCensus aggregation data)
129
+ self._data = {}
130
+
131
+ @property
132
+ def name(self):
133
+ return self._name
134
+
135
+ @property
136
+ def desc(self):
137
+ return self._desc
138
+
139
+ @property
140
+ def unit(self):
141
+ return self._unit
142
+
143
+ @property
144
+ def label_keys(self):
145
+ return self._label_keys
146
+
147
+ @property
148
+ def data(self):
149
+ return self._data
150
+
151
+ def record(self, metric: Metric):
152
+ """Parse the Opencensus Protobuf and store the data.
153
+
154
+ The data can be accessed via `data` API once recorded.
155
+ """
156
+ timeseries = metric.timeseries
157
+
158
+ if len(timeseries) == 0:
159
+ return
160
+
161
+ # Create the aggregation and fill in our stats
162
+ for series in timeseries:
163
+ labels = tuple(val.value for val in series.label_values)
164
+
165
+ # Aggregate points.
166
+ for point in series.points:
167
+ if (
168
+ metric.metric_descriptor.type
169
+ == MetricDescriptorType.CUMULATIVE_INT64
170
+ ):
171
+ data = CountAggregationData(point.int64_value)
172
+ elif (
173
+ metric.metric_descriptor.type
174
+ == MetricDescriptorType.CUMULATIVE_DOUBLE
175
+ ):
176
+ data = SumAggregationData(ValueDouble, point.double_value)
177
+ elif metric.metric_descriptor.type == MetricDescriptorType.GAUGE_DOUBLE:
178
+ data = LastValueAggregationData(ValueDouble, point.double_value)
179
+ elif (
180
+ metric.metric_descriptor.type
181
+ == MetricDescriptorType.CUMULATIVE_DISTRIBUTION
182
+ ):
183
+ dist_value = point.distribution_value
184
+ counts_per_bucket = [bucket.count for bucket in dist_value.buckets]
185
+ bucket_bounds = dist_value.bucket_options.explicit.bounds
186
+ data = DistributionAggregationData(
187
+ dist_value.sum / dist_value.count,
188
+ dist_value.count,
189
+ dist_value.sum_of_squared_deviation,
190
+ counts_per_bucket,
191
+ bucket_bounds,
192
+ )
193
+ else:
194
+ raise ValueError("Summary is not supported")
195
+ self._data[labels] = data
196
+
197
+
198
+ class Component:
199
+ def __init__(self, id: str):
200
+ """Represent a component that requests to proxy export metrics
201
+
202
+ Args:
203
+ id: Id of this component.
204
+ """
205
+ self.id = id
206
+ # -- The time this component reported its metrics last time --
207
+ # It is used to figure out if this component is stale.
208
+ self._last_reported_time = time.monotonic()
209
+ # -- Metrics requested to proxy export from this component --
210
+ # metrics_name (str) -> metric (OpencensusProxyMetric)
211
+ self._metrics = {}
212
+
213
+ @property
214
+ def metrics(self) -> Dict[str, OpencensusProxyMetric]:
215
+ """Return the metrics requested to proxy export from this component."""
216
+ return self._metrics
217
+
218
+ @property
219
+ def last_reported_time(self):
220
+ return self._last_reported_time
221
+
222
+ def record(self, metrics: List[Metric]):
223
+ """Parse the Opencensus protobuf and store metrics.
224
+
225
+ Metrics can be accessed via `metrics` API for proxy export.
226
+
227
+ Args:
228
+ metrics: A list of Opencensus protobuf for proxy export.
229
+ """
230
+ self._last_reported_time = time.monotonic()
231
+ for metric in metrics:
232
+ fix_grpc_metric(metric)
233
+ descriptor = metric.metric_descriptor
234
+ name = descriptor.name
235
+ label_keys = [label_key.key for label_key in descriptor.label_keys]
236
+
237
+ if name not in self._metrics:
238
+ self._metrics[name] = OpencensusProxyMetric(
239
+ name, descriptor.description, descriptor.unit, label_keys
240
+ )
241
+ self._metrics[name].record(metric)
242
+
243
+
244
+ class OpenCensusProxyCollector:
245
+ def __init__(self, namespace: str, component_timeout_s: int = 60):
246
+ """Prometheus collector implementation for opencensus proxy export.
247
+
248
+ A Prometheus collector is required to implement `collect`, which is
249
+ invoked whenever Prometheus queries the endpoint.
250
+
251
+ The class is thread-safe.
252
+
253
+ Args:
254
+ namespace: Prometheus namespace.
255
+ """
256
+ # -- Protect `self._components` --
257
+ self._components_lock = threading.Lock()
258
+ # -- Timeout until the component is marked as stale --
259
+ # Once the component is considered as stale,
260
+ # the metrics from that worker won't be exported.
261
+ self._component_timeout_s = component_timeout_s
262
+ # -- Prometheus namespace --
263
+ self._namespace = namespace
264
+ # -- Component that requests to proxy export metrics --
265
+ # Component means core worker, raylet, and GCS.
266
+ # component_id -> Components
267
+ # For workers, they contain worker ids.
268
+ # For other components (raylet, GCS),
269
+ # they contain the global key `GLOBAL_COMPONENT_KEY`.
270
+ self._components = {}
271
+ # Whether we want to export counter as gauge.
272
+ # This is for bug compatibility.
273
+ # See https://github.com/ray-project/ray/pull/43795.
274
+ self._export_counter_as_gauge = env_bool("RAY_EXPORT_COUNTER_AS_GAUGE", True)
275
+
276
+ def record(self, metrics: List[Metric], worker_id_hex: str = None):
277
+ """Record the metrics reported from the component that reports it.
278
+
279
+ Args:
280
+ metrics: A list of opencensus protobuf to proxy export metrics.
281
+ worker_id_hex: A worker id that reports these metrics.
282
+ If None, it means they are reported from Raylet or GCS.
283
+ """
284
+ key = GLOBAL_COMPONENT_KEY if not worker_id_hex else worker_id_hex
285
+ with self._components_lock:
286
+ if key not in self._components:
287
+ self._components[key] = Component(key)
288
+ self._components[key].record(metrics)
289
+
290
+ def clean_stale_components(self):
291
+ """Clean up stale components.
292
+
293
+ Stale means the component is dead or unresponsive.
294
+
295
+ Stale components won't be reported to Prometheus anymore.
296
+ """
297
+ with self._components_lock:
298
+ stale_components = []
299
+ stale_component_ids = []
300
+ for id, component in self._components.items():
301
+ elapsed = time.monotonic() - component.last_reported_time
302
+ if elapsed > self._component_timeout_s:
303
+ stale_component_ids.append(id)
304
+ logger.info(
305
+ "Metrics from a worker ({}) are cleaned up due to "
306
+ "timeout. Time since last report {}s".format(id, elapsed)
307
+ )
308
+ for id in stale_component_ids:
309
+ stale_components.append(self._components.pop(id))
310
+ return stale_components
311
+
312
+ # TODO(sang): add start and end timestamp
313
+ def to_metrics(
314
+ self,
315
+ metric_name: str,
316
+ metric_description: str,
317
+ label_keys: List[str],
318
+ metric_units: str,
319
+ label_values: Tuple[tag_value_module.TagValue],
320
+ agg_data: Any,
321
+ metrics_map: Dict[str, List[PrometheusMetric]],
322
+ ):
323
+ """to_metrics translates the data that OpenCensus creates
324
+ into Prometheus format, using Prometheus Metric objects.
325
+
326
+ This method is from Opencensus Prometheus Exporter.
327
+
328
+ Args:
329
+ metric_name: Name of the metric.
330
+ metric_description: Description of the metric.
331
+ label_keys: The fixed label keys of the metric.
332
+ metric_units: Units of the metric.
333
+ label_values: The values of `label_keys`.
334
+ agg_data: `opencensus.stats.aggregation_data.AggregationData` object.
335
+ Aggregated data that needs to be converted to Prometheus samples.
336
+ metrics_map: The converted metric is added to this map.
337
+
338
+ """
339
+ assert self._components_lock.locked()
340
+ metric_name = f"{self._namespace}_{metric_name}"
341
+ assert len(label_values) == len(label_keys), (label_values, label_keys)
342
+ # Prometheus requires that all tag values be strings hence
343
+ # the need to cast none to the empty string before exporting. See
344
+ # https://github.com/census-instrumentation/opencensus-python/issues/480
345
+ label_values = [tv if tv else "" for tv in label_values]
346
+
347
+ if isinstance(agg_data, CountAggregationData):
348
+ metrics = metrics_map.get(metric_name)
349
+ if not metrics:
350
+ metric = CounterMetricFamily(
351
+ name=metric_name,
352
+ documentation=metric_description,
353
+ unit=metric_units,
354
+ labels=label_keys,
355
+ )
356
+ metrics = [metric]
357
+ metrics_map[metric_name] = metrics
358
+ metrics[0].add_metric(labels=label_values, value=agg_data.count_data)
359
+ return
360
+
361
+ if isinstance(agg_data, SumAggregationData):
362
+ # This should be emitted as prometheus counter
363
+ # but we used to emit it as prometheus gauge.
364
+ # To keep the backward compatibility
365
+ # (changing from counter to gauge changes the metric name
366
+ # since prometheus client will add "_total" suffix to counter
367
+ # per OpenMetrics specification),
368
+ # we now emit both counter and gauge and in the
369
+ # next major Ray release (3.0) we can stop emitting gauge.
370
+ # This leaves people enough time to migrate their dashboards.
371
+ # See https://github.com/ray-project/ray/pull/43795.
372
+ metrics = metrics_map.get(metric_name)
373
+ if not metrics:
374
+ metric = CounterMetricFamily(
375
+ name=metric_name,
376
+ documentation=metric_description,
377
+ labels=label_keys,
378
+ )
379
+ metrics = [metric]
380
+ metrics_map[metric_name] = metrics
381
+ metrics[0].add_metric(labels=label_values, value=agg_data.sum_data)
382
+
383
+ if not self._export_counter_as_gauge:
384
+ pass
385
+ elif metric_name.endswith("_total"):
386
+ # In this case, we only need to emit prometheus counter
387
+ # since for metric name already ends with _total suffix
388
+ # prometheus client won't change it
389
+ # so there is no backward compatibility issue.
390
+ # See https://prometheus.github.io/client_python/instrumenting/counter/
391
+ pass
392
+ else:
393
+ if len(metrics) == 1:
394
+ metric = GaugeMetricFamily(
395
+ name=metric_name,
396
+ documentation=(
397
+ f"(DEPRECATED, use {metric_name}_total metric instead) "
398
+ f"{metric_description}"
399
+ ),
400
+ labels=label_keys,
401
+ )
402
+ metrics.append(metric)
403
+ assert len(metrics) == 2
404
+ metrics[1].add_metric(labels=label_values, value=agg_data.sum_data)
405
+ return
406
+
407
+ elif isinstance(agg_data, DistributionAggregationData):
408
+
409
+ assert agg_data.bounds == sorted(agg_data.bounds)
410
+ # buckets are a list of buckets. Each bucket is another list with
411
+ # a pair of bucket name and value, or a triple of bucket name,
412
+ # value, and exemplar. buckets need to be in order.
413
+ buckets = []
414
+ cum_count = 0 # Prometheus buckets expect cumulative count.
415
+ for ii, bound in enumerate(agg_data.bounds):
416
+ cum_count += agg_data.counts_per_bucket[ii]
417
+ bucket = [str(bound), cum_count]
418
+ buckets.append(bucket)
419
+ # Prometheus requires buckets to be sorted, and +Inf present.
420
+ # In OpenCensus we don't have +Inf in the bucket bounds so we need to
421
+ # append it here.
422
+ buckets.append(["+Inf", agg_data.count_data])
423
+ metrics = metrics_map.get(metric_name)
424
+ if not metrics:
425
+ metric = HistogramMetricFamily(
426
+ name=metric_name,
427
+ documentation=metric_description,
428
+ labels=label_keys,
429
+ )
430
+ metrics = [metric]
431
+ metrics_map[metric_name] = metrics
432
+ metrics[0].add_metric(
433
+ labels=label_values,
434
+ buckets=buckets,
435
+ sum_value=agg_data.sum,
436
+ )
437
+ return
438
+
439
+ elif isinstance(agg_data, LastValueAggregationData):
440
+ metrics = metrics_map.get(metric_name)
441
+ if not metrics:
442
+ metric = GaugeMetricFamily(
443
+ name=metric_name,
444
+ documentation=metric_description,
445
+ labels=label_keys,
446
+ )
447
+ metrics = [metric]
448
+ metrics_map[metric_name] = metrics
449
+ metrics[0].add_metric(labels=label_values, value=agg_data.value)
450
+ return
451
+
452
+ else:
453
+ raise ValueError(f"unsupported aggregation type {type(agg_data)}")
454
+
455
+ def collect(self): # pragma: NO COVER
456
+ """Collect fetches the statistics from OpenCensus
457
+ and delivers them as Prometheus Metrics.
458
+ Collect is invoked every time a prometheus.Gatherer is run
459
+ for example when the HTTP endpoint is invoked by Prometheus.
460
+
461
+ This method is required as a Prometheus Collector.
462
+ """
463
+ with self._components_lock:
464
+ metrics_map = {}
465
+ for component in self._components.values():
466
+ for metric in component.metrics.values():
467
+ for label_values, data in metric.data.items():
468
+ self.to_metrics(
469
+ metric.name,
470
+ metric.desc,
471
+ metric.label_keys,
472
+ metric.unit,
473
+ label_values,
474
+ data,
475
+ metrics_map,
476
+ )
477
+
478
+ for metrics in metrics_map.values():
479
+ for metric in metrics:
480
+ yield metric
481
+
482
+
483
+ class MetricsAgent:
484
+ def __init__(
485
+ self,
486
+ view_manager: ViewManager,
487
+ stats_recorder: StatsRecorder,
488
+ stats_exporter: StatsExporter = None,
489
+ ):
490
+ """A class to record and export metrics.
491
+
492
+ The class exports metrics in 2 different ways.
493
+ - Directly record and export metrics using OpenCensus.
494
+ - Proxy metrics from other core components
495
+ (e.g., raylet, GCS, core workers).
496
+
497
+ This class is thread-safe.
498
+ """
499
+ # Lock required because gRPC server uses
500
+ # multiple threads to process requests.
501
+ self._lock = threading.Lock()
502
+
503
+ #
504
+ # Opencensus components to record metrics.
505
+ #
506
+
507
+ # Managing views to export metrics
508
+ # If the stats_exporter is None, we disable all metrics export.
509
+ self.view_manager = view_manager
510
+ # A class that's used to record metrics
511
+ # emitted from the current process.
512
+ self.stats_recorder = stats_recorder
513
+ # A class to export metrics.
514
+ self.stats_exporter = stats_exporter
515
+ # -- A Prometheus custom collector to proxy export metrics --
516
+ # `None` if the prometheus server is not started.
517
+ self.proxy_exporter_collector = None
518
+
519
+ if self.stats_exporter is None:
520
+ # If the exporter is not given,
521
+ # we disable metrics collection.
522
+ self.view_manager = None
523
+ else:
524
+ self.view_manager.register_exporter(stats_exporter)
525
+ self.proxy_exporter_collector = OpenCensusProxyCollector(
526
+ self.stats_exporter.options.namespace,
527
+ component_timeout_s=int(os.getenv(RAY_WORKER_TIMEOUT_S, 120)),
528
+ )
529
+
530
+ # Registered view names.
531
+ self._registered_views: Set[str] = set()
532
+
533
+ def record_and_export(self, records: List[Record], global_tags=None):
534
+ """Directly record and export stats from the same process."""
535
+ global_tags = global_tags or {}
536
+ with self._lock:
537
+ if not self.view_manager:
538
+ return
539
+
540
+ for record in records:
541
+ gauge = record.gauge
542
+ value = record.value
543
+ tags = record.tags
544
+ self._record_gauge(gauge, value, {**tags, **global_tags})
545
+
546
+ def _record_gauge(self, gauge: Gauge, value: float, tags: dict):
547
+ if gauge.name not in self._registered_views:
548
+ self.view_manager.register_view(gauge.view)
549
+ self._registered_views.add(gauge.name)
550
+ measurement_map = self.stats_recorder.new_measurement_map()
551
+ tag_map = tag_map_module.TagMap()
552
+ for key, tag_val in tags.items():
553
+ tag_key = tag_key_module.TagKey(key)
554
+ tag_value = tag_value_module.TagValue(tag_val)
555
+ tag_map.insert(tag_key, tag_value)
556
+ measurement_map.measure_float_put(gauge.measure, value)
557
+ # NOTE: When we record this metric, timestamp will be renewed.
558
+ measurement_map.record(tag_map)
559
+
560
+ def proxy_export_metrics(self, metrics: List[Metric], worker_id_hex: str = None):
561
+ """Proxy export metrics specified by a Opencensus Protobuf.
562
+
563
+ This API is used to export metrics emitted from
564
+ core components.
565
+
566
+ Args:
567
+ metrics: A list of protobuf Metric defined from OpenCensus.
568
+ worker_id_hex: The ID of the worker whose metrics are proxy exported. None
569
+ if the metric is not from a worker (i.e., raylet, GCS).
570
+ """
571
+ with self._lock:
572
+ if not self.view_manager:
573
+ return
574
+
575
+ self._proxy_export_metrics(metrics, worker_id_hex)
576
+
577
+ def _proxy_export_metrics(self, metrics: List[Metric], worker_id_hex: str = None):
578
+ self.proxy_exporter_collector.record(metrics, worker_id_hex)
579
+
580
+ def clean_all_dead_worker_metrics(self):
581
+ """Clean dead worker's metrics.
582
+
583
+ Worker metrics are cleaned up and won't be exported once
584
+ it is considered as dead.
585
+
586
+ This method has to be periodically called by a caller.
587
+ """
588
+ with self._lock:
589
+ if not self.view_manager:
590
+ return
591
+
592
+ self.proxy_exporter_collector.clean_stale_components()
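Tying the pieces together, a hedged sketch of constructing a `MetricsAgent` from the global OpenCensus stats object (assuming the opencensus package Ray depends on is installed) and recording one gauge value. Passing `stats_exporter=None` disables export per `__init__` above, so `record_and_export` is effectively a no-op here; a real exporter (e.g. Ray's Prometheus exporter) would be needed to publish anything:

from opencensus.stats import stats as stats_module

from ray._private.metrics_agent import Gauge, MetricsAgent, Record

stats = stats_module.stats  # global OpenCensus Stats object

agent = MetricsAgent(
    view_manager=stats.view_manager,
    stats_recorder=stats.stats_recorder,
    stats_exporter=None,  # export disabled; see __init__ above
)

gauge = Gauge("example_queue_size", "Illustrative gauge.", "items", ["Component"])
agent.record_and_export(
    [Record(gauge=gauge, value=42.0, tags={"Component": "demo"})],
    global_tags={"Version": "illustrative"},
)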
593
+
594
+
595
+ class PrometheusServiceDiscoveryWriter(threading.Thread):
596
+ """A class to support Prometheus service discovery.
597
+
598
+ It supports file-based service discovery. Checkout
599
+ https://prometheus.io/docs/guides/file-sd/ for more details.
600
+
601
+ Args:
602
+ gcs_address: Gcs address for this cluster.
603
+ temp_dir: Temporary directory used by
604
+ Ray to store logs and metadata.
605
+ """
606
+
607
+ def __init__(self, gcs_address, temp_dir):
608
+ gcs_client_options = ray._raylet.GcsClientOptions.create(
609
+ gcs_address, None, allow_cluster_id_nil=True, fetch_cluster_id_if_nil=False
610
+ )
611
+ self.gcs_address = gcs_address
612
+
613
+ ray._private.state.state._initialize_global_state(gcs_client_options)
614
+ self.temp_dir = temp_dir
615
+ self.default_service_discovery_flush_period = 5
616
+ super().__init__()
617
+
618
+ def get_file_discovery_content(self):
619
+ """Return the content for Prometheus service discovery."""
620
+ nodes = ray.nodes()
621
+ metrics_export_addresses = [
622
+ "{}:{}".format(node["NodeManagerAddress"], node["MetricsExportPort"])
623
+ for node in nodes
624
+ if node["alive"] is True
625
+ ]
626
+ gcs_client = GcsClient(address=self.gcs_address)
627
+ autoscaler_addr = gcs_client.internal_kv_get(b"AutoscalerMetricsAddress", None)
628
+ if autoscaler_addr:
629
+ metrics_export_addresses.append(autoscaler_addr.decode("utf-8"))
630
+ dashboard_addr = gcs_client.internal_kv_get(b"DashboardMetricsAddress", None)
631
+ if dashboard_addr:
632
+ metrics_export_addresses.append(dashboard_addr.decode("utf-8"))
633
+ return json.dumps(
634
+ [{"labels": {"job": "ray"}, "targets": metrics_export_addresses}]
635
+ )
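The JSON returned above follows Prometheus's file-based service discovery format: a single entry whose `targets` are the metrics endpoints of alive nodes plus, when registered, the autoscaler and dashboard addresses. An illustrative (invented) example of the resulting file content:

# Illustrative only; the real addresses come from ray.nodes() and the GCS KV store.
example_content = [
    {
        "labels": {"job": "ray"},
        "targets": [
            "10.0.0.1:8080",   # a node's MetricsExportPort
            "10.0.0.2:8080",
            "10.0.0.1:44217",  # autoscaler or dashboard metrics address, if set
        ],
    }
]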
636
+
637
+ def write(self):
638
+ # Write a file based on https://prometheus.io/docs/guides/file-sd/
639
+ # Write should be atomic. Otherwise, Prometheus raises an error that
640
+ # json file format is invalid because it reads a file when
641
+ # file is re-written. Note that Prometheus still works although we
642
+ # have this error.
643
+ temp_file_name = self.get_temp_file_name()
644
+ with open(temp_file_name, "w") as json_file:
645
+ json_file.write(self.get_file_discovery_content())
646
+ # NOTE: os.replace is atomic on both Linux and Windows, so we won't
647
+ # have race condition reading this file.
648
+ os.replace(temp_file_name, self.get_target_file_name())
649
+
650
+ def get_target_file_name(self):
651
+ return os.path.join(
652
+ self.temp_dir, ray._private.ray_constants.PROMETHEUS_SERVICE_DISCOVERY_FILE
653
+ )
654
+
655
+ def get_temp_file_name(self):
656
+ return os.path.join(
657
+ self.temp_dir,
658
+ "{}_{}".format(
659
+ "tmp", ray._private.ray_constants.PROMETHEUS_SERVICE_DISCOVERY_FILE
660
+ ),
661
+ )
662
+
663
+ def run(self):
664
+ while True:
665
+ # This thread won't be broken by exceptions.
666
+ try:
667
+ self.write()
668
+ except Exception as e:
669
+ logger.warning(
670
+ "Writing a service discovery file, {}, "
671
+ "failed.".format(self.get_target_file_name())
672
+ )
673
+ logger.warning(traceback.format_exc())
674
+ logger.warning(f"Error message: {e}")
675
+ time.sleep(self.default_service_discovery_flush_period)
.venv/lib/python3.11/site-packages/ray/_private/node.py ADDED
@@ -0,0 +1,1862 @@
1
+ import atexit
2
+ import collections
3
+ import datetime
4
+ import errno
5
+ import json
6
+ import logging
7
+ import os
8
+ import random
9
+ import signal
10
+ import socket
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import threading
15
+ import time
16
+ import traceback
17
+ from collections import defaultdict
18
+ from typing import Dict, Optional, Tuple, IO, AnyStr
19
+
20
+ from filelock import FileLock
21
+
22
+ import ray
23
+ import ray._private.ray_constants as ray_constants
24
+ import ray._private.services
25
+ from ray._private import storage
26
+ from ray._raylet import GcsClient, get_session_key_from_storage
27
+ from ray._private.resource_spec import ResourceSpec
28
+ from ray._private.services import serialize_config, get_address
29
+ from ray._private.utils import open_log, try_to_create_directory, try_to_symlink
30
+
31
+ # Logger for this module. It should be configured at the entry point
32
+ # into the program using Ray. Ray configures it by default automatically
33
+ # using logging.basicConfig in its entry/init points.
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class Node:
38
+ """An encapsulation of the Ray processes on a single node.
39
+
40
+ This class is responsible for starting Ray processes and killing them,
41
+ and it also controls the temp file policy.
42
+
43
+ Attributes:
44
+ all_processes: A mapping from process type (str) to a list of
45
+ ProcessInfo objects. All lists have length one except for the Redis
46
+ server list, which has multiple.
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ ray_params,
52
+ head: bool = False,
53
+ shutdown_at_exit: bool = True,
54
+ spawn_reaper: bool = True,
55
+ connect_only: bool = False,
56
+ default_worker: bool = False,
57
+ ray_init_cluster: bool = False,
58
+ ):
59
+ """Start a node.
60
+
61
+ Args:
62
+ ray_params: The RayParams to use to configure the node.
63
+ head: True if this is the head node, which means it will
64
+ start additional processes like the Redis servers, monitor
65
+ processes, and web UI.
66
+ shutdown_at_exit: If true, spawned processes will be cleaned
67
+ up if this process exits normally.
68
+ spawn_reaper: If true, spawns a process that will clean up
69
+ other spawned processes if this process dies unexpectedly.
70
+ connect_only: If true, connect to the node without starting
71
+ new processes.
72
+ default_worker: Whether it's running from a ray worker or not
73
+ ray_init_cluster: Whether it's a cluster created by ray.init()
74
+ """
75
+ if shutdown_at_exit:
76
+ if connect_only:
77
+ raise ValueError(
78
+ "'shutdown_at_exit' and 'connect_only' cannot both be true."
79
+ )
80
+ self._register_shutdown_hooks()
81
+ self._default_worker = default_worker
82
+ self.head = head
83
+ self.kernel_fate_share = bool(
84
+ spawn_reaper and ray._private.utils.detect_fate_sharing_support()
85
+ )
86
+ self.all_processes: dict = {}
87
+ self.removal_lock = threading.Lock()
88
+
89
+ self.ray_init_cluster = ray_init_cluster
90
+ if ray_init_cluster:
91
+ assert head, "ray.init() created cluster only has the head node"
92
+
93
+ # Set up external Redis when `RAY_REDIS_ADDRESS` is specified.
94
+ redis_address_env = os.environ.get("RAY_REDIS_ADDRESS")
95
+ if ray_params.external_addresses is None and redis_address_env is not None:
96
+ external_redis = redis_address_env.split(",")
97
+
98
+ # Reuse primary Redis as Redis shard when there's only one
99
+ # instance provided.
100
+ if len(external_redis) == 1:
101
+ external_redis.append(external_redis[0])
102
+ [primary_redis_ip, port] = external_redis[0].rsplit(":", 1)
103
+ ray_params.external_addresses = external_redis
104
+ ray_params.num_redis_shards = len(external_redis) - 1
105
+
106
+ if (
107
+ ray_params._system_config
108
+ and len(ray_params._system_config) > 0
109
+ and (not head and not connect_only)
110
+ ):
111
+ raise ValueError(
112
+ "System config parameters can only be set on the head node."
113
+ )
114
+
115
+ ray_params.update_if_absent(
116
+ include_log_monitor=True,
117
+ resources={},
118
+ worker_path=os.path.join(
119
+ os.path.dirname(os.path.abspath(__file__)),
120
+ "workers",
121
+ "default_worker.py",
122
+ ),
123
+ setup_worker_path=os.path.join(
124
+ os.path.dirname(os.path.abspath(__file__)),
125
+ "workers",
126
+ ray_constants.SETUP_WORKER_FILENAME,
127
+ ),
128
+ )
129
+
130
+ self._resource_spec = None
131
+ self._localhost = socket.gethostbyname("localhost")
132
+ self._ray_params = ray_params
133
+ self._config = ray_params._system_config or {}
134
+
135
+ self._dashboard_agent_listen_port = ray_params.dashboard_agent_listen_port
136
+ self._dashboard_grpc_port = ray_params.dashboard_grpc_port
137
+
138
+ # Configure log rotation parameters.
139
+ self.max_bytes = int(
140
+ os.getenv("RAY_ROTATION_MAX_BYTES", ray_constants.LOGGING_ROTATE_BYTES)
141
+ )
142
+ self.backup_count = int(
143
+ os.getenv(
144
+ "RAY_ROTATION_BACKUP_COUNT", ray_constants.LOGGING_ROTATE_BACKUP_COUNT
145
+ )
146
+ )
147
+
148
+ assert self.max_bytes >= 0
149
+ assert self.backup_count >= 0
150
+
151
+ self._redis_address = ray_params.redis_address
152
+ if head:
153
+ ray_params.update_if_absent(num_redis_shards=1)
154
+ self._gcs_address = ray_params.gcs_address
155
+ self._gcs_client = None
156
+
157
+ if not self.head:
158
+ self.validate_ip_port(self.address)
159
+ self._init_gcs_client()
160
+
161
+ # Register the temp dir.
162
+ self._session_name = ray_params.session_name
163
+ if self._session_name is None:
164
+ if head:
165
+ # We expect this the first time we initialize a cluster, but not during
166
+ # subsequent restarts of the head node.
167
+ maybe_key = self.check_persisted_session_name()
168
+ if maybe_key is None:
169
+ # date including microsecond
170
+ date_str = datetime.datetime.today().strftime(
171
+ "%Y-%m-%d_%H-%M-%S_%f"
172
+ )
173
+ self._session_name = f"session_{date_str}_{os.getpid()}"
174
+ else:
175
+ self._session_name = ray._private.utils.decode(maybe_key)
176
+ else:
177
+ assert not self._default_worker
178
+ session_name = ray._private.utils.internal_kv_get_with_retry(
179
+ self.get_gcs_client(),
180
+ "session_name",
181
+ ray_constants.KV_NAMESPACE_SESSION,
182
+ num_retries=ray_constants.NUM_REDIS_GET_RETRIES,
183
+ )
184
+ self._session_name = ray._private.utils.decode(session_name)
185
+
186
+ # Initialize webui url
187
+ if head:
188
+ self._webui_url = None
189
+ else:
190
+ if ray_params.webui is None:
191
+ assert not self._default_worker
192
+ self._webui_url = ray._private.services.get_webui_url_from_internal_kv()
193
+ else:
194
+ self._webui_url = (
195
+ f"{ray_params.dashboard_host}:{ray_params.dashboard_port}"
196
+ )
197
+
198
+ # It creates a session_dir.
199
+ self._init_temp()
200
+
201
+ node_ip_address = ray_params.node_ip_address
202
+ if node_ip_address is None:
203
+ if connect_only:
204
+ node_ip_address = self._wait_and_get_for_node_address()
205
+ else:
206
+ node_ip_address = ray.util.get_node_ip_address()
207
+
208
+ assert node_ip_address is not None
209
+ ray_params.update_if_absent(
210
+ node_ip_address=node_ip_address, raylet_ip_address=node_ip_address
211
+ )
212
+ self._node_ip_address = node_ip_address
213
+ if not connect_only:
214
+ ray._private.services.write_node_ip_address(
215
+ self.get_session_dir_path(), node_ip_address
216
+ )
217
+
218
+ if ray_params.raylet_ip_address:
219
+ raylet_ip_address = ray_params.raylet_ip_address
220
+ else:
221
+ raylet_ip_address = node_ip_address
222
+
223
+ if raylet_ip_address != node_ip_address and (not connect_only or head):
224
+ raise ValueError(
225
+ "The raylet IP address should only be different than the node "
226
+ "IP address when connecting to an existing raylet; i.e., when "
227
+ "head=False and connect_only=True."
228
+ )
229
+ self._raylet_ip_address = raylet_ip_address
230
+
231
+ # Validate and initialize the persistent storage API.
232
+ if head:
233
+ storage._init_storage(ray_params.storage, is_head=True)
234
+ else:
235
+ if not self._default_worker:
236
+ storage_uri = ray._private.services.get_storage_uri_from_internal_kv()
237
+ else:
238
+ storage_uri = ray_params.storage
239
+ storage._init_storage(storage_uri, is_head=False)
240
+
241
+ # If it is a head node, try validating if
242
+ # external storage is configurable.
243
+ if head:
244
+ self.validate_external_storage()
245
+
246
+ if connect_only:
247
+ # Get socket names from the configuration.
248
+ self._plasma_store_socket_name = ray_params.plasma_store_socket_name
249
+ self._raylet_socket_name = ray_params.raylet_socket_name
250
+ self._node_id = ray_params.node_id
251
+
252
+ # If user does not provide the socket name, get it from Redis.
253
+ if (
254
+ self._plasma_store_socket_name is None
255
+ or self._raylet_socket_name is None
256
+ or self._ray_params.node_manager_port is None
257
+ or self._node_id is None
258
+ ):
259
+ # Get the address info of the processes to connect to
260
+ # from Redis or GCS.
261
+ node_info = ray._private.services.get_node_to_connect_for_driver(
262
+ self.gcs_address,
263
+ self._raylet_ip_address,
264
+ )
265
+ self._plasma_store_socket_name = node_info["object_store_socket_name"]
266
+ self._raylet_socket_name = node_info["raylet_socket_name"]
267
+ self._ray_params.node_manager_port = node_info["node_manager_port"]
268
+ self._node_id = node_info["node_id"]
269
+ else:
270
+ # If the user specified a socket name, use it.
271
+ self._plasma_store_socket_name = self._prepare_socket_file(
272
+ self._ray_params.plasma_store_socket_name, default_prefix="plasma_store"
273
+ )
274
+ self._raylet_socket_name = self._prepare_socket_file(
275
+ self._ray_params.raylet_socket_name, default_prefix="raylet"
276
+ )
277
+ if (
278
+ self._ray_params.env_vars is not None
279
+ and "RAY_OVERRIDE_NODE_ID_FOR_TESTING" in self._ray_params.env_vars
280
+ ):
281
+ node_id = self._ray_params.env_vars["RAY_OVERRIDE_NODE_ID_FOR_TESTING"]
282
+ logger.debug(
283
+ f"Setting node ID to {node_id} "
284
+ "based on ray_params.env_vars override"
285
+ )
286
+ self._node_id = node_id
287
+ elif os.environ.get("RAY_OVERRIDE_NODE_ID_FOR_TESTING"):
288
+ node_id = os.environ["RAY_OVERRIDE_NODE_ID_FOR_TESTING"]
289
+ logger.debug(f"Setting node ID to {node_id} based on env override")
290
+ self._node_id = node_id
291
+ else:
292
+ node_id = ray.NodeID.from_random().hex()
293
+ logger.debug(f"Setting node ID to {node_id}")
294
+ self._node_id = node_id
295
+
296
+ # The dashboard agent port is assigned first to avoid
297
+ # other processes accidentally taking its default port
298
+ self._dashboard_agent_listen_port = self._get_cached_port(
299
+ "dashboard_agent_listen_port",
300
+ default_port=ray_params.dashboard_agent_listen_port,
301
+ )
302
+
303
+ self.metrics_agent_port = self._get_cached_port(
304
+ "metrics_agent_port", default_port=ray_params.metrics_agent_port
305
+ )
306
+ self._metrics_export_port = self._get_cached_port(
307
+ "metrics_export_port", default_port=ray_params.metrics_export_port
308
+ )
309
+ self._runtime_env_agent_port = self._get_cached_port(
310
+ "runtime_env_agent_port",
311
+ default_port=ray_params.runtime_env_agent_port,
312
+ )
313
+
314
+ ray_params.update_if_absent(
315
+ metrics_agent_port=self.metrics_agent_port,
316
+ metrics_export_port=self._metrics_export_port,
317
+ dashboard_agent_listen_port=self._dashboard_agent_listen_port,
318
+ runtime_env_agent_port=self._runtime_env_agent_port,
319
+ )
320
+
321
+ # Pick a GCS server port.
322
+ if head:
323
+ gcs_server_port = os.getenv(ray_constants.GCS_PORT_ENVIRONMENT_VARIABLE)
324
+ if gcs_server_port:
325
+ ray_params.update_if_absent(gcs_server_port=int(gcs_server_port))
326
+ if ray_params.gcs_server_port is None or ray_params.gcs_server_port == 0:
327
+ ray_params.gcs_server_port = self._get_cached_port("gcs_server_port")
328
+
329
+ if not connect_only and spawn_reaper and not self.kernel_fate_share:
330
+ self.start_reaper_process()
331
+ if not connect_only:
332
+ self._ray_params.update_pre_selected_port()
333
+
334
+ # Start processes.
335
+ if head:
336
+ self.start_head_processes()
337
+
338
+ if not connect_only:
339
+ self.start_ray_processes()
340
+ # we should update the address info after the node has been started
341
+ try:
342
+ ray._private.services.wait_for_node(
343
+ self.gcs_address,
344
+ self._plasma_store_socket_name,
345
+ )
346
+ except TimeoutError as te:
347
+ raise Exception(
348
+ "The current node timed out during startup. This "
349
+ "could happen because some of the Ray processes "
350
+ "failed to startup."
351
+ ) from te
352
+ node_info = ray._private.services.get_node(
353
+ self.gcs_address,
354
+ self._node_id,
355
+ )
356
+ if self._ray_params.node_manager_port == 0:
357
+ self._ray_params.node_manager_port = node_info["node_manager_port"]
358
+
359
+ # Makes sure the Node object has valid addresses after setup.
360
+ self.validate_ip_port(self.address)
361
+ self.validate_ip_port(self.gcs_address)
362
+
363
+ if not connect_only:
364
+ self._record_stats()
365
+
366
+ def check_persisted_session_name(self):
367
+ if self._ray_params.external_addresses is None:
368
+ return None
369
+ self._redis_address = self._ray_params.external_addresses[0]
370
+ redis_ip_address, redis_port, enable_redis_ssl = get_address(
371
+ self._redis_address,
372
+ )
373
+ # Address is ip:port or redis://ip:port
374
+ if int(redis_port) < 0:
375
+ raise ValueError(
376
+ f"Invalid Redis port provided: {redis_port}."
377
+ "The port must be a non-negative integer."
378
+ )
379
+
380
+ return get_session_key_from_storage(
381
+ redis_ip_address,
382
+ int(redis_port),
383
+ self._ray_params.redis_username,
384
+ self._ray_params.redis_password,
385
+ enable_redis_ssl,
386
+ serialize_config(self._config),
387
+ b"session_name",
388
+ )
389
+
390
+ @staticmethod
391
+ def validate_ip_port(ip_port):
392
+ """Validates the address is in the ip:port format"""
393
+ _, _, port = ip_port.rpartition(":")
394
+ if port == ip_port:
395
+ raise ValueError(f"Port is not specified for address {ip_port}")
396
+ try:
397
+ _ = int(port)
398
+ except ValueError:
399
+ raise ValueError(
400
+ f"Unable to parse port number from {port} (full address = {ip_port})"
401
+ )
402
+
403
+ def check_version_info(self):
404
+ """Check if the Python and Ray version of this process matches that in GCS.
405
+
406
+ This will be used to detect if workers or drivers are started using
407
+ different versions of Python, or Ray.
408
+
409
+ Raises:
410
+ Exception: An exception is raised if there is a version mismatch.
411
+ """
412
+ import ray._private.usage.usage_lib as ray_usage_lib
413
+
414
+ cluster_metadata = ray_usage_lib.get_cluster_metadata(self.get_gcs_client())
415
+ if cluster_metadata is None:
416
+ cluster_metadata = ray_usage_lib.get_cluster_metadata(self.get_gcs_client())
417
+
418
+ if not cluster_metadata:
419
+ return
420
+ node_ip_address = ray._private.services.get_node_ip_address()
421
+ ray._private.utils.check_version_info(
422
+ cluster_metadata, f"node {node_ip_address}"
423
+ )
424
+
425
+ def _register_shutdown_hooks(self):
426
+ # Register the atexit handler. In this case, we shouldn't call sys.exit
427
+ # as we're already in the exit procedure.
428
+ def atexit_handler(*args):
429
+ self.kill_all_processes(check_alive=False, allow_graceful=True)
430
+
431
+ atexit.register(atexit_handler)
432
+
433
+ # Register the handler to be called if we get a SIGTERM.
434
+ # In this case, we want to exit with an error code (1) after
435
+ # cleaning up child processes.
436
+ def sigterm_handler(signum, frame):
437
+ self.kill_all_processes(check_alive=False, allow_graceful=True)
438
+ sys.exit(1)
439
+
440
+ ray._private.utils.set_sigterm_handler(sigterm_handler)
441
+
442
+ def _init_temp(self):
443
+ # Create a dictionary to store temp file index.
444
+ self._incremental_dict = collections.defaultdict(lambda: 0)
445
+
446
+ if self.head:
447
+ self._ray_params.update_if_absent(
448
+ temp_dir=ray._private.utils.get_ray_temp_dir()
449
+ )
450
+ self._temp_dir = self._ray_params.temp_dir
451
+ else:
452
+ if self._ray_params.temp_dir is None:
453
+ assert not self._default_worker
454
+ temp_dir = ray._private.utils.internal_kv_get_with_retry(
455
+ self.get_gcs_client(),
456
+ "temp_dir",
457
+ ray_constants.KV_NAMESPACE_SESSION,
458
+ num_retries=ray_constants.NUM_REDIS_GET_RETRIES,
459
+ )
460
+ self._temp_dir = ray._private.utils.decode(temp_dir)
461
+ else:
462
+ self._temp_dir = self._ray_params.temp_dir
463
+
464
+ try_to_create_directory(self._temp_dir)
465
+
466
+ if self.head:
467
+ self._session_dir = os.path.join(self._temp_dir, self._session_name)
468
+ else:
469
+ if self._temp_dir is None or self._session_name is None:
470
+ assert not self._default_worker
471
+ session_dir = ray._private.utils.internal_kv_get_with_retry(
472
+ self.get_gcs_client(),
473
+ "session_dir",
474
+ ray_constants.KV_NAMESPACE_SESSION,
475
+ num_retries=ray_constants.NUM_REDIS_GET_RETRIES,
476
+ )
477
+ self._session_dir = ray._private.utils.decode(session_dir)
478
+ else:
479
+ self._session_dir = os.path.join(self._temp_dir, self._session_name)
480
+ session_symlink = os.path.join(self._temp_dir, ray_constants.SESSION_LATEST)
481
+
482
+ # Send a warning message if the session exists.
483
+ try_to_create_directory(self._session_dir)
484
+ try_to_symlink(session_symlink, self._session_dir)
485
+ # Create a directory to be used for socket files.
486
+ self._sockets_dir = os.path.join(self._session_dir, "sockets")
487
+ try_to_create_directory(self._sockets_dir)
488
+ # Create a directory to be used for process log files.
489
+ self._logs_dir = os.path.join(self._session_dir, "logs")
490
+ try_to_create_directory(self._logs_dir)
491
+ old_logs_dir = os.path.join(self._logs_dir, "old")
492
+ try_to_create_directory(old_logs_dir)
493
+ # Create a directory to be used for runtime environment.
494
+ self._runtime_env_dir = os.path.join(
495
+ self._session_dir, self._ray_params.runtime_env_dir_name
496
+ )
497
+ try_to_create_directory(self._runtime_env_dir)
498
+
499
+ def _get_node_labels(self):
500
+ def merge_labels(env_override_labels, params_labels):
501
+ """Merges two dictionaries, picking from the
502
+ first in the event of a conflict. Also emit a warning on every
503
+ conflict.
504
+ """
505
+
506
+ result = params_labels.copy()
507
+ result.update(env_override_labels)
508
+
509
+ for key in set(env_override_labels.keys()).intersection(
510
+ set(params_labels.keys())
511
+ ):
512
+ if params_labels[key] != env_override_labels[key]:
513
+ logger.warning(
514
+ "Autoscaler is overriding your label:"
515
+ f"{key}: {params_labels[key]} to "
516
+ f"{key}: {env_override_labels[key]}."
517
+ )
518
+ return result
519
+
520
+ env_override_labels = {}
521
+ env_override_labels_string = os.getenv(
522
+ ray_constants.LABELS_ENVIRONMENT_VARIABLE
523
+ )
524
+ if env_override_labels_string:
525
+ try:
526
+ env_override_labels = json.loads(env_override_labels_string)
527
+ except Exception:
528
+ logger.exception(f"Failed to load {env_override_labels_string}")
529
+ raise
530
+ logger.info(f"Autoscaler overriding labels: {env_override_labels}.")
531
+
532
+ return merge_labels(env_override_labels, self._ray_params.labels or {})
533
+
534
+ def get_resource_spec(self):
535
+ """Resolve and return the current resource spec for the node."""
536
+
537
+ def merge_resources(env_dict, params_dict):
538
+ """Separates special case params and merges two dictionaries, picking from the
539
+ first in the event of a conflict. Also emit a warning on every
540
+ conflict.
541
+ """
542
+ num_cpus = env_dict.pop("CPU", None)
543
+ num_gpus = env_dict.pop("GPU", None)
544
+ memory = env_dict.pop("memory", None)
545
+ object_store_memory = env_dict.pop("object_store_memory", None)
546
+
547
+ result = params_dict.copy()
548
+ result.update(env_dict)
549
+
550
+ for key in set(env_dict.keys()).intersection(set(params_dict.keys())):
551
+ if params_dict[key] != env_dict[key]:
552
+ logger.warning(
553
+ "Autoscaler is overriding your resource:"
554
+ f"{key}: {params_dict[key]} with {env_dict[key]}."
555
+ )
556
+ return num_cpus, num_gpus, memory, object_store_memory, result
557
+
558
+ if not self._resource_spec:
559
+ env_resources = {}
560
+ env_string = os.getenv(ray_constants.RESOURCES_ENVIRONMENT_VARIABLE)
561
+ if env_string:
562
+ try:
563
+ env_resources = json.loads(env_string)
564
+ except Exception:
565
+ logger.exception(f"Failed to load {env_string}")
566
+ raise
567
+ logger.debug(f"Autoscaler overriding resources: {env_resources}.")
568
+ (
569
+ num_cpus,
570
+ num_gpus,
571
+ memory,
572
+ object_store_memory,
573
+ resources,
574
+ ) = merge_resources(env_resources, self._ray_params.resources)
575
+ self._resource_spec = ResourceSpec(
576
+ self._ray_params.num_cpus if num_cpus is None else num_cpus,
577
+ self._ray_params.num_gpus if num_gpus is None else num_gpus,
578
+ self._ray_params.memory if memory is None else memory,
579
+ (
580
+ self._ray_params.object_store_memory
581
+ if object_store_memory is None
582
+ else object_store_memory
583
+ ),
584
+ resources,
585
+ self._ray_params.redis_max_memory,
586
+ ).resolve(is_head=self.head, node_ip_address=self.node_ip_address)
587
+ return self._resource_spec
588
+
589
+ @property
590
+ def node_id(self):
591
+ """Get the node ID."""
592
+ return self._node_id
593
+
594
+ @property
595
+ def session_name(self):
596
+ """Get the session name (cluster ID)."""
597
+ return self._session_name
598
+
599
+ @property
600
+ def node_ip_address(self):
601
+ """Get the IP address of this node."""
602
+ return self._node_ip_address
603
+
604
+ @property
605
+ def raylet_ip_address(self):
606
+ """Get the IP address of the raylet that this node connects to."""
607
+ return self._raylet_ip_address
608
+
609
+ @property
610
+ def address(self):
611
+ """Get the address for bootstrapping, e.g. the address to pass to
612
+ `ray start` or `ray.init()` to start worker nodes, that has been
613
+ converted to ip:port format.
614
+ """
615
+ return self._gcs_address
616
+
617
+ @property
618
+ def gcs_address(self):
619
+ """Get the gcs address."""
620
+ assert self._gcs_address is not None, "Gcs address is not set"
621
+ return self._gcs_address
622
+
623
+ @property
624
+ def redis_address(self):
625
+ """Get the cluster Redis address."""
626
+ return self._redis_address
627
+
628
+ @property
629
+ def redis_username(self):
630
+ """Get the cluster Redis username."""
631
+ return self._ray_params.redis_username
632
+
633
+ @property
634
+ def redis_password(self):
635
+ """Get the cluster Redis password."""
636
+ return self._ray_params.redis_password
637
+
638
+ @property
639
+ def object_ref_seed(self):
640
+ """Get the seed for deterministic generation of object refs"""
641
+ return self._ray_params.object_ref_seed
642
+
643
+ @property
644
+ def plasma_store_socket_name(self):
645
+ """Get the node's plasma store socket name."""
646
+ return self._plasma_store_socket_name
647
+
648
+ @property
649
+ def unique_id(self):
650
+ """Get a unique identifier for this node."""
651
+ return f"{self.node_ip_address}:{self._plasma_store_socket_name}"
652
+
653
+ @property
654
+ def webui_url(self):
655
+ """Get the cluster's web UI url."""
656
+ return self._webui_url
657
+
658
+ @property
659
+ def raylet_socket_name(self):
660
+ """Get the node's raylet socket name."""
661
+ return self._raylet_socket_name
662
+
663
+ @property
664
+ def node_manager_port(self):
665
+ """Get the node manager's port."""
666
+ return self._ray_params.node_manager_port
667
+
668
+ @property
669
+ def metrics_export_port(self):
670
+ """Get the port that exposes metrics"""
671
+ return self._metrics_export_port
672
+
673
+ @property
674
+ def runtime_env_agent_port(self):
675
+ """Get the port that exposes runtime env agent as http"""
676
+ return self._runtime_env_agent_port
677
+
678
+ @property
679
+ def runtime_env_agent_address(self):
680
+ """Get the address that exposes runtime env agent as http"""
681
+ return f"http://{self._raylet_ip_address}:{self._runtime_env_agent_port}"
682
+
683
+ @property
684
+ def dashboard_agent_listen_port(self):
685
+ """Get the dashboard agent's listen port"""
686
+ return self._dashboard_agent_listen_port
687
+
688
+ @property
689
+ def dashboard_grpc_port(self):
690
+ """Get the dashboard head grpc port"""
691
+ return self._dashboard_grpc_port
692
+
693
+ @property
694
+ def logging_config(self):
695
+ """Get the logging config of the current node."""
696
+ return {
697
+ "log_rotation_max_bytes": self.max_bytes,
698
+ "log_rotation_backup_count": self.backup_count,
699
+ }
700
+
701
+ @property
702
+ def address_info(self):
703
+ """Get a dictionary of addresses."""
704
+ return {
705
+ "node_ip_address": self._node_ip_address,
706
+ "raylet_ip_address": self._raylet_ip_address,
707
+ "redis_address": self.redis_address,
708
+ "object_store_address": self._plasma_store_socket_name,
709
+ "raylet_socket_name": self._raylet_socket_name,
710
+ "webui_url": self._webui_url,
711
+ "session_dir": self._session_dir,
712
+ "metrics_export_port": self._metrics_export_port,
713
+ "gcs_address": self.gcs_address,
714
+ "address": self.address,
715
+ "dashboard_agent_listen_port": self.dashboard_agent_listen_port,
716
+ }
717
+
718
+ def is_head(self):
719
+ return self.head
720
+
721
+ def get_gcs_client(self):
722
+ if self._gcs_client is None:
723
+ self._init_gcs_client()
724
+ return self._gcs_client
725
+
726
+ def _init_gcs_client(self):
727
+ if self.head:
728
+ gcs_process = self.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER][
729
+ 0
730
+ ].process
731
+ else:
732
+ gcs_process = None
733
+
734
+ # TODO(ryw) instead of create a new GcsClient, wrap the one from
735
+ # CoreWorkerProcess to save a grpc channel.
736
+ for _ in range(ray_constants.NUM_REDIS_GET_RETRIES):
737
+ gcs_address = None
738
+ last_ex = None
739
+ try:
740
+ gcs_address = self.gcs_address
741
+ client = GcsClient(
742
+ address=gcs_address,
743
+ cluster_id=self._ray_params.cluster_id, # Hex string
744
+ )
745
+ self.cluster_id = client.cluster_id
746
+ if self.head:
747
+ # Send a simple request to make sure GCS is alive
748
+ # if it's a head node.
749
+ client.internal_kv_get(b"dummy", None)
750
+ self._gcs_client = client
751
+ break
752
+ except Exception:
753
+ if gcs_process is not None and gcs_process.poll() is not None:
754
+ # GCS has exited.
755
+ break
756
+ last_ex = traceback.format_exc()
757
+ logger.debug(f"Connecting to GCS: {last_ex}")
758
+ time.sleep(1)
759
+
760
+ if self._gcs_client is None:
761
+ if hasattr(self, "_logs_dir"):
762
+ with open(os.path.join(self._logs_dir, "gcs_server.err")) as err:
763
+ # Use " C " or " E " to exclude the stacktrace.
764
+ # This should work for most cases, especitally
765
+ # it's when GCS is starting. Only display last 10 lines of logs.
766
+ errors = [e for e in err.readlines() if " C " in e or " E " in e][
767
+ -10:
768
+ ]
769
+ error_msg = "\n" + "".join(errors) + "\n"
770
+ raise RuntimeError(
771
+ f"Failed to {'start' if self.head else 'connect to'} GCS. "
772
+ f" Last {len(errors)} lines of error files:"
773
+ f"{error_msg}."
774
+ f"Please check {os.path.join(self._logs_dir, 'gcs_server.out')}"
775
+ f" for details. Last connection error: {last_ex}"
776
+ )
777
+ else:
778
+ raise RuntimeError(
779
+ f"Failed to {'start' if self.head else 'connect to'} GCS. Last "
780
+ f"connection error: {last_ex}"
781
+ )
782
+
783
+ ray.experimental.internal_kv._initialize_internal_kv(self._gcs_client)
784
+
785
+ def get_temp_dir_path(self):
786
+ """Get the path of the temporary directory."""
787
+ return self._temp_dir
788
+
789
+ def get_runtime_env_dir_path(self):
790
+ """Get the path of the runtime env."""
791
+ return self._runtime_env_dir
792
+
793
+ def get_session_dir_path(self):
794
+ """Get the path of the session directory."""
795
+ return self._session_dir
796
+
797
+ def get_logs_dir_path(self):
798
+ """Get the path of the log files directory."""
799
+ return self._logs_dir
800
+
801
+ def get_sockets_dir_path(self):
802
+ """Get the path of the sockets directory."""
803
+ return self._sockets_dir
804
+
805
+ def _make_inc_temp(
806
+ self, suffix: str = "", prefix: str = "", directory_name: Optional[str] = None
807
+ ):
808
+ """Return an incremental temporary file name. The file is not created.
809
+
810
+ Args:
811
+ suffix: The suffix of the temp file.
812
+ prefix: The prefix of the temp file.
813
+ directory_name (str) : The base directory of the temp file.
814
+
815
+ Returns:
816
+ A file name as a string. If a file with the same name already
+ exists, the returned name will look like
+ "{directory_name}/{prefix}.{unique_index}{suffix}"
819
+ """
820
+ if directory_name is None:
821
+ directory_name = ray._private.utils.get_ray_temp_dir()
822
+ directory_name = os.path.expanduser(directory_name)
823
+ index = self._incremental_dict[suffix, prefix, directory_name]
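+ # Resume from the last index recorded for this (suffix, prefix, directory)
+ # combination so repeated calls keep counting up.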
824
+ # `tempfile.TMP_MAX` could be extremely large,
825
+ # so using `range` in Python2.x should be avoided.
826
+ while index < tempfile.TMP_MAX:
827
+ if index == 0:
828
+ filename = os.path.join(directory_name, prefix + suffix)
829
+ else:
830
+ filename = os.path.join(
831
+ directory_name, prefix + "." + str(index) + suffix
832
+ )
833
+ index += 1
834
+ if not os.path.exists(filename):
835
+ # Save the index.
836
+ self._incremental_dict[suffix, prefix, directory_name] = index
837
+ return filename
838
+
839
+ raise FileExistsError(errno.EEXIST, "No usable temporary filename found")
840
+
841
+ def should_redirect_logs(self):
842
+ redirect_output = self._ray_params.redirect_output
843
+ if redirect_output is None:
844
+ # Fall back to stderr redirect environment variable.
845
+ redirect_output = (
846
+ os.environ.get(
847
+ ray_constants.LOGGING_REDIRECT_STDERR_ENVIRONMENT_VARIABLE
848
+ )
849
+ != "1"
850
+ )
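+ # Redirection stays enabled unless the env var is explicitly set to "1",
+ # which sends component logs to stderr instead of files.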
851
+ return redirect_output
852
+
853
+ def get_log_file_names(
854
+ self,
855
+ name: str,
856
+ unique: bool = False,
857
+ create_out: bool = True,
858
+ create_err: bool = True,
859
+ ) -> Tuple[Optional[str], Optional[str]]:
860
+ """Get filename to dump logs for stdout and stderr, with no files opened.
861
+ If output redirection has been disabled, no files will
862
+ be opened and `(None, None)` will be returned.
863
+
864
+ Args:
865
+ name: descriptive string for this log file.
866
+ unique: if true, a counter will be attached to `name` to
867
+ ensure the returned filename is not already used.
868
+ create_out: if True, create a .out file.
869
+ create_err: if True, create a .err file.
870
+
871
+ Returns:
872
+ A tuple of two file names for the optional (stdout, stderr) logs,
873
+ or `(None, None)` if output redirection is disabled.
874
+ """
875
+ if not self.should_redirect_logs():
876
+ return None, None
877
+
878
+ log_stdout = None
879
+ log_stderr = None
880
+
881
+ if create_out:
882
+ log_stdout = self._get_log_file_name(name, "out", unique=unique)
883
+ if create_err:
884
+ log_stderr = self._get_log_file_name(name, "err", unique=unique)
885
+ return log_stdout, log_stderr
886
+
887
+ def get_log_file_handles(
888
+ self,
889
+ name: str,
890
+ unique: bool = False,
891
+ create_out: bool = True,
892
+ create_err: bool = True,
893
+ ) -> Tuple[Optional[IO[AnyStr]], Optional[IO[AnyStr]]]:
894
+ """Open log files with partially randomized filenames, returning the
895
+ file handles. If output redirection has been disabled, no files will
896
+ be opened and `(None, None)` will be returned.
897
+
898
+ Args:
899
+ name: descriptive string for this log file.
900
+ unique: if true, a counter will be attached to `name` to
901
+ ensure the returned filename is not already used.
902
+ create_out: if True, create a .out file.
903
+ create_err: if True, create a .err file.
904
+
905
+ Returns:
906
+ A tuple of two file handles for redirecting optional (stdout, stderr),
907
+ or `(None, None)` if output redirection is disabled.
908
+ """
909
+ log_stdout_fname, log_stderr_fname = self.get_log_file_names(
910
+ name, unique=unique, create_out=create_out, create_err=create_err
911
+ )
912
+ log_stdout = None if log_stdout_fname is None else open_log(log_stdout_fname)
913
+ log_stderr = None if log_stderr_fname is None else open_log(log_stderr_fname)
914
+ return log_stdout, log_stderr
915
+
916
+ def _get_log_file_name(
917
+ self,
918
+ name: str,
919
+ suffix: str,
920
+ unique: bool = False,
921
+ ) -> str:
922
+ """Generate partially randomized filenames for log files.
923
+
924
+ Args:
925
+ name: descriptive string for this log file.
926
+ suffix: suffix of the file. Usually it is .out or .err.
927
+ unique: if true, a counter will be attached to `name` to
928
+ ensure the returned filename is not already used.
929
+
930
+ Returns:
931
+ The generated log file name.
932
+ """
933
+ # strip if the suffix is something like .out.
934
+ suffix = suffix.strip(".")
935
+
936
+ if unique:
937
+ filename = self._make_inc_temp(
938
+ suffix=f".{suffix}", prefix=name, directory_name=self._logs_dir
939
+ )
940
+ else:
941
+ filename = os.path.join(self._logs_dir, f"{name}.{suffix}")
942
+ return filename
943
+
944
+ def _get_unused_port(self, allocated_ports=None):
945
+ if allocated_ports is None:
946
+ allocated_ports = set()
947
+
948
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
949
+ s.bind(("", 0))
950
+ port = s.getsockname()[1]
951
+
952
+ # Try to generate a port that is far above the 'next available' one.
953
+ # This solves issue #8254 where GRPC fails because the port assigned
954
+ # from this method has been used by a different process.
955
+ for _ in range(ray_constants.NUM_PORT_RETRIES):
956
+ new_port = random.randint(port, 65535)
957
+ if new_port in allocated_ports:
958
+ # This port is allocated for other usage already,
959
+ # so we shouldn't use it even if it's not in use right now.
960
+ continue
961
+ new_s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
962
+ try:
963
+ new_s.bind(("", new_port))
964
+ except OSError:
965
+ new_s.close()
966
+ continue
967
+ s.close()
968
+ new_s.close()
969
+ return new_port
970
+ logger.error("Unable to succeed in selecting a random port.")
971
+ s.close()
972
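+ # All retries failed; fall back to the port from the initial bind,
+ # even though another process may grab it first.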
+ return port
973
+
974
+ def _prepare_socket_file(self, socket_path: str, default_prefix: str):
975
+ """Prepare the socket file for raylet and plasma.
976
+
977
+ This method helps to prepare a socket file.
978
+ 1. Make the directory if the directory does not exist.
979
+ 2. If the socket file exists, do nothing (this just means we aren't the
980
+ first worker on the node).
981
+
982
+ Args:
983
+ socket_path: the socket file to prepare.
984
+ """
985
+ result = socket_path
986
+ is_mac = sys.platform.startswith("darwin")
987
+ if sys.platform == "win32":
988
+ if socket_path is None:
989
+ result = f"tcp://{self._localhost}" f":{self._get_unused_port()}"
990
+ else:
991
+ if socket_path is None:
992
+ result = self._make_inc_temp(
993
+ prefix=default_prefix, directory_name=self._sockets_dir
994
+ )
995
+ else:
996
+ try_to_create_directory(os.path.dirname(socket_path))
997
+
998
+ # Check socket path length to make sure it's short enough
999
+ maxlen = (104 if is_mac else 108) - 1 # sockaddr_un->sun_path
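+ # The subtracted byte leaves room for the trailing NUL terminator in sockaddr_un.sun_path.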
1000
+ if len(result.split("://", 1)[-1].encode("utf-8")) > maxlen:
1001
+ raise OSError(
1002
+ f"AF_UNIX path length cannot exceed {maxlen} bytes: {result!r}"
1003
+ )
1004
+ return result
1005
+
1006
+ def _get_cached_port(
1007
+ self, port_name: str, default_port: Optional[int] = None
1008
+ ) -> int:
1009
+ """Get a port number from a cache on this node.
1010
+
1011
+ Different driver processes on a node should use the same ports for
1012
+ some purposes, e.g. exporting metrics. This method returns a port
1013
+ number for the given port name and caches it in a file. If the
1014
+ port isn't already cached, an unused port is generated and cached.
1015
+
1016
+ Args:
1017
+ port_name: the name of the port, e.g. metrics_export_port
1018
+ default_port (Optional[int]): The port to return and cache if no
1019
+ port has already been cached for the given port_name. If None, an
1020
+ unused port is generated and cached.
1021
+ Returns:
1022
+ port: the port number.
1023
+ """
1024
+ file_path = os.path.join(self.get_session_dir_path(), "ports_by_node.json")
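+ # All driver processes on this node share this JSON file (guarded by the
+ # FileLock below) so they agree on which ports have been cached.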
1025
+
1026
+ # Make sure only the ports in RAY_CACHED_PORTS are cached.
1027
+ assert port_name in ray_constants.RAY_ALLOWED_CACHED_PORTS
1028
+
1029
+ # Maps a Node.unique_id to a dict that maps port names to port numbers.
1030
+ ports_by_node: Dict[str, Dict[str, int]] = defaultdict(dict)
1031
+
1032
+ with FileLock(file_path + ".lock"):
1033
+ if not os.path.exists(file_path):
1034
+ with open(file_path, "w") as f:
1035
+ json.dump({}, f)
1036
+
1037
+ with open(file_path, "r") as f:
1038
+ ports_by_node.update(json.load(f))
1039
+
1040
+ if (
1041
+ self.unique_id in ports_by_node
1042
+ and port_name in ports_by_node[self.unique_id]
1043
+ ):
1044
+ # The port has already been cached at this node, so use it.
1045
+ port = int(ports_by_node[self.unique_id][port_name])
1046
+ else:
1047
+ # Pick a new port to use and cache it at this node.
1048
+ allocated_ports = set(ports_by_node[self.unique_id].values())
1049
+
1050
+ if default_port is not None and default_port in allocated_ports:
1051
+ # The default port is already in use, so don't use it.
1052
+ default_port = None
1053
+
1054
+ port = default_port or self._get_unused_port(allocated_ports)
1055
+
1056
+ ports_by_node[self.unique_id][port_name] = port
1057
+ with open(file_path, "w") as f:
1058
+ json.dump(ports_by_node, f)
1059
+
1060
+ return port
1061
+
1062
+ def _wait_and_get_for_node_address(self, timeout_s: int = 60) -> str:
1063
+ """Wait until the RAY_NODE_IP_FILENAME file is avialable.
1064
+
1065
+ RAY_NODE_IP_FILENAME is created when a ray instance is started.
1066
+
1067
+ Args:
1068
+ timeout_s: If the ip address is not found within this
1069
+ timeout, it will raise ValueError.
1070
+ Returns:
1071
+ The node_ip_address of the current session if it finds it
1072
+ within timeout_s.
1073
+ """
1074
+ for i in range(timeout_s):
1075
+ node_ip_address = ray._private.services.get_cached_node_ip_address(
1076
+ self.get_session_dir_path()
1077
+ )
1078
+
1079
+ if node_ip_address is not None:
1080
+ return node_ip_address
1081
+
1082
+ time.sleep(1)
1083
+ if i % 10 == 0:
1084
+ logger.info(
1085
+ f"Can't find a `{ray_constants.RAY_NODE_IP_FILENAME}` "
1086
+ f"file from {self.get_session_dir_path()}. "
1087
+ "Have you started Ray instance using "
1088
+ "`ray start` or `ray.init`?"
1089
+ )
1090
+
1091
+ raise ValueError(
1092
+ f"Can't find a `{ray_constants.RAY_NODE_IP_FILENAME}` "
1093
+ f"file from {self.get_session_dir_path()}. "
1094
+ f"for {timeout_s} seconds. "
1095
+ "A ray instance hasn't started. "
1096
+ "Did you do `ray start` or `ray.init` on this host?"
1097
+ )
1098
+
1099
+ def start_reaper_process(self):
1100
+ """
1101
+ Start the reaper process.
1102
+
1103
+ This must be the first process spawned and should only be called when
1104
+ ray processes should be cleaned up if this process dies.
1105
+ """
1106
+ assert (
1107
+ not self.kernel_fate_share
1108
+ ), "a reaper should not be used with kernel fate-sharing"
1109
+ process_info = ray._private.services.start_reaper(fate_share=False)
1110
+ assert ray_constants.PROCESS_TYPE_REAPER not in self.all_processes
1111
+ if process_info is not None:
1112
+ self.all_processes[ray_constants.PROCESS_TYPE_REAPER] = [
1113
+ process_info,
1114
+ ]
1115
+
1116
+ def start_log_monitor(self):
1117
+ """Start the log monitor."""
1118
+ # Only redirect logs to .err. .err file is only useful when the
1119
+ # component has an unexpected output to stdout/stderr.
1120
+ _, stderr_file = self.get_log_file_handles(
1121
+ "log_monitor", unique=True, create_out=False
1122
+ )
1123
+ process_info = ray._private.services.start_log_monitor(
1124
+ self.get_session_dir_path(),
1125
+ self._logs_dir,
1126
+ self.gcs_address,
1127
+ fate_share=self.kernel_fate_share,
1128
+ max_bytes=self.max_bytes,
1129
+ backup_count=self.backup_count,
1130
+ redirect_logging=self.should_redirect_logs(),
1131
+ stdout_file=stderr_file,
1132
+ stderr_file=stderr_file,
1133
+ )
1134
+ assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes
1135
+ self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [
1136
+ process_info,
1137
+ ]
1138
+
1139
+ def start_api_server(
1140
+ self, *, include_dashboard: Optional[bool], raise_on_failure: bool
1141
+ ):
1142
+ """Start the dashboard.
1143
+
1144
+ Args:
1145
+ include_dashboard: If true, this will load all dashboard-related modules
1146
+ when starting the API server. Otherwise, it will only
1147
+ start the modules that are not relevant to the dashboard.
1148
+ raise_on_failure: If true, this will raise an exception
1149
+ if we fail to start the API server. Otherwise it will print
1150
+ a warning if we fail to start the API server.
1151
+ """
1152
+ # Only redirect logs to .err. .err file is only useful when the
1153
+ # component has an unexpected output to stdout/stderr.
1154
+ _, stderr_file = self.get_log_file_handles(
1155
+ "dashboard", unique=True, create_out=False
1156
+ )
1157
+ self._webui_url, process_info = ray._private.services.start_api_server(
1158
+ include_dashboard,
1159
+ raise_on_failure,
1160
+ self._ray_params.dashboard_host,
1161
+ self.gcs_address,
1162
+ self.cluster_id.hex(),
1163
+ self._node_ip_address,
1164
+ self._temp_dir,
1165
+ self._logs_dir,
1166
+ self._session_dir,
1167
+ port=self._ray_params.dashboard_port,
1168
+ dashboard_grpc_port=self._ray_params.dashboard_grpc_port,
1169
+ fate_share=self.kernel_fate_share,
1170
+ max_bytes=self.max_bytes,
1171
+ backup_count=self.backup_count,
1172
+ redirect_logging=self.should_redirect_logs(),
1173
+ stdout_file=stderr_file,
1174
+ stderr_file=stderr_file,
1175
+ )
1176
+ assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes
1177
+ if process_info is not None:
1178
+ self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD] = [
1179
+ process_info,
1180
+ ]
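+ # Publish the dashboard URL in the GCS internal KV so other components can look it up.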
1181
+ self.get_gcs_client().internal_kv_put(
1182
+ b"webui:url",
1183
+ self._webui_url.encode(),
1184
+ True,
1185
+ ray_constants.KV_NAMESPACE_DASHBOARD,
1186
+ )
1187
+
1188
+ def start_gcs_server(self):
1189
+ """Start the gcs server."""
1190
+ gcs_server_port = self._ray_params.gcs_server_port
1191
+ assert gcs_server_port > 0
1192
+ assert self._gcs_address is None, "GCS server is already running."
1193
+ assert self._gcs_client is None, "GCS client is already connected."
1194
+
1195
+ # TODO(hjiang): Update stderr to pass filename and get spdlog to handle
1196
+ # logging as well.
1197
+ stdout_log_fname, _ = self.get_log_file_names(
1198
+ "gcs_server", unique=True, create_out=True, create_err=False
1199
+ )
1200
+ _, stderr_file = self.get_log_file_handles(
1201
+ "gcs_server", unique=True, create_out=False, create_err=True
1202
+ )
1203
+ process_info = ray._private.services.start_gcs_server(
1204
+ self.redis_address,
1205
+ log_dir=self._logs_dir,
1206
+ ray_log_filepath=stdout_log_fname,
1207
+ stderr_file=stderr_file,
1208
+ session_name=self.session_name,
1209
+ redis_username=self._ray_params.redis_username,
1210
+ redis_password=self._ray_params.redis_password,
1211
+ config=self._config,
1212
+ fate_share=self.kernel_fate_share,
1213
+ gcs_server_port=gcs_server_port,
1214
+ metrics_agent_port=self._ray_params.metrics_agent_port,
1215
+ node_ip_address=self._node_ip_address,
1216
+ )
1217
+ assert ray_constants.PROCESS_TYPE_GCS_SERVER not in self.all_processes
1218
+ self.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] = [
1219
+ process_info,
1220
+ ]
1221
+ # Connecting via non-localhost address may be blocked by firewall rule,
1222
+ # e.g. https://github.com/ray-project/ray/issues/15780
1223
+ # TODO(mwtian): figure out a way to use 127.0.0.1 for local connection
1224
+ # when possible.
1225
+ self._gcs_address = f"{self._node_ip_address}:" f"{gcs_server_port}"
1226
+
1227
+ def start_raylet(
1228
+ self,
1229
+ plasma_directory: str,
1230
+ object_store_memory: int,
1231
+ use_valgrind: bool = False,
1232
+ use_profiler: bool = False,
1233
+ enable_physical_mode: bool = False,
1234
+ ):
1235
+ """Start the raylet.
1236
+
1237
+ Args:
1238
+ use_valgrind: True if we should start the process in
1239
+ valgrind.
1240
+ use_profiler: True if we should start the process in the
1241
+ valgrind profiler.
1242
+ """
1243
+ stdout_log_fname, _ = self.get_log_file_names(
1244
+ "raylet", unique=True, create_out=True, create_err=False
1245
+ )
1246
+ _, stderr_file = self.get_log_file_handles(
1247
+ "raylet", unique=True, create_out=False, create_err=True
1248
+ )
1249
+ process_info = ray._private.services.start_raylet(
1250
+ self.redis_address,
1251
+ self.gcs_address,
1252
+ self._node_id,
1253
+ self._node_ip_address,
1254
+ self._ray_params.node_manager_port,
1255
+ self._raylet_socket_name,
1256
+ self._plasma_store_socket_name,
1257
+ self.cluster_id.hex(),
1258
+ self._ray_params.worker_path,
1259
+ self._ray_params.setup_worker_path,
1260
+ self._ray_params.storage,
1261
+ self._temp_dir,
1262
+ self._session_dir,
1263
+ self._runtime_env_dir,
1264
+ self._logs_dir,
1265
+ self.get_resource_spec(),
1266
+ plasma_directory,
1267
+ object_store_memory,
1268
+ self.session_name,
1269
+ is_head_node=self.is_head(),
1270
+ min_worker_port=self._ray_params.min_worker_port,
1271
+ max_worker_port=self._ray_params.max_worker_port,
1272
+ worker_port_list=self._ray_params.worker_port_list,
1273
+ object_manager_port=self._ray_params.object_manager_port,
1274
+ redis_username=self._ray_params.redis_username,
1275
+ redis_password=self._ray_params.redis_password,
1276
+ metrics_agent_port=self._ray_params.metrics_agent_port,
1277
+ runtime_env_agent_port=self._ray_params.runtime_env_agent_port,
1278
+ metrics_export_port=self._metrics_export_port,
1279
+ dashboard_agent_listen_port=self._ray_params.dashboard_agent_listen_port,
1280
+ use_valgrind=use_valgrind,
1281
+ use_profiler=use_profiler,
1282
+ ray_log_filepath=stdout_log_fname,
1283
+ stderr_file=stderr_file,
1284
+ huge_pages=self._ray_params.huge_pages,
1285
+ fate_share=self.kernel_fate_share,
1286
+ socket_to_use=None,
1287
+ max_bytes=self.max_bytes,
1288
+ backup_count=self.backup_count,
1289
+ ray_debugger_external=self._ray_params.ray_debugger_external,
1290
+ env_updates=self._ray_params.env_vars,
1291
+ node_name=self._ray_params.node_name,
1292
+ webui=self._webui_url,
1293
+ labels=self._get_node_labels(),
1294
+ enable_physical_mode=enable_physical_mode,
1295
+ )
1296
+ assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
1297
+ self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
1298
+
1299
+ def start_worker(self):
1300
+ """Start a worker process."""
1301
+ raise NotImplementedError
1302
+
1303
+ def start_monitor(self):
1304
+ """Start the monitor.
1305
+
1306
+ Autoscaling output goes to the monitor.err/out files, and
1307
+ any modification to these files may break existing
1308
+ cluster launching commands.
1309
+ """
1310
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
1311
+
1312
+ stdout_file, stderr_file = self.get_log_file_handles("monitor", unique=True)
1313
+ process_info = ray._private.services.start_monitor(
1314
+ self.gcs_address,
1315
+ self._logs_dir,
1316
+ stdout_file=stdout_file,
1317
+ stderr_file=stderr_file,
1318
+ autoscaling_config=self._ray_params.autoscaling_config,
1319
+ fate_share=self.kernel_fate_share,
1320
+ max_bytes=self.max_bytes,
1321
+ backup_count=self.backup_count,
1322
+ monitor_ip=self._node_ip_address,
1323
+ autoscaler_v2=is_autoscaler_v2(fetch_from_server=True),
1324
+ )
1325
+ assert ray_constants.PROCESS_TYPE_MONITOR not in self.all_processes
1326
+ self.all_processes[ray_constants.PROCESS_TYPE_MONITOR] = [process_info]
1327
+
1328
+ def start_ray_client_server(self):
1329
+ """Start the ray client server process."""
1330
+ stdout_file, stderr_file = self.get_log_file_handles(
1331
+ "ray_client_server", unique=True
1332
+ )
1333
+ process_info = ray._private.services.start_ray_client_server(
1334
+ self.address,
1335
+ self._node_ip_address,
1336
+ self._ray_params.ray_client_server_port,
1337
+ stdout_file=stdout_file,
1338
+ stderr_file=stderr_file,
1339
+ redis_username=self._ray_params.redis_username,
1340
+ redis_password=self._ray_params.redis_password,
1341
+ fate_share=self.kernel_fate_share,
1342
+ runtime_env_agent_address=self.runtime_env_agent_address,
1343
+ )
1344
+ assert ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER not in self.all_processes
1345
+ self.all_processes[ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER] = [
1346
+ process_info
1347
+ ]
1348
+
1349
+ def _write_cluster_info_to_kv(self):
1350
+ """Write the cluster metadata to GCS.
1351
+ Cluster metadata is always recorded, but it is
+ not reported unless usage reporting is enabled.
1353
+ Check `usage_stats_head.py` for more details.
1354
+ """
1355
+ # Make sure the cluster metadata wasn't reported before.
1356
+ import ray._private.usage.usage_lib as ray_usage_lib
1357
+
1358
+ ray_usage_lib.put_cluster_metadata(
1359
+ self.get_gcs_client(), ray_init_cluster=self.ray_init_cluster
1360
+ )
1361
+ # Make sure GCS is up.
1362
+ added = self.get_gcs_client().internal_kv_put(
1363
+ b"session_name",
1364
+ self._session_name.encode(),
1365
+ False,
1366
+ ray_constants.KV_NAMESPACE_SESSION,
1367
+ )
1368
+ if not added:
1369
+ curr_val = self.get_gcs_client().internal_kv_get(
1370
+ b"session_name", ray_constants.KV_NAMESPACE_SESSION
1371
+ )
1372
+ assert curr_val == self._session_name.encode("utf-8"), (
1373
+ f"Session name {self._session_name} does not match "
1374
+ f"persisted value {curr_val}. Perhaps there was an "
1375
+ f"error connecting to Redis."
1376
+ )
1377
+
1378
+ self.get_gcs_client().internal_kv_put(
1379
+ b"session_dir",
1380
+ self._session_dir.encode(),
1381
+ True,
1382
+ ray_constants.KV_NAMESPACE_SESSION,
1383
+ )
1384
+ self.get_gcs_client().internal_kv_put(
1385
+ b"temp_dir",
1386
+ self._temp_dir.encode(),
1387
+ True,
1388
+ ray_constants.KV_NAMESPACE_SESSION,
1389
+ )
1390
+ if self._ray_params.storage is not None:
1391
+ self.get_gcs_client().internal_kv_put(
1392
+ b"storage",
1393
+ self._ray_params.storage.encode(),
1394
+ True,
1395
+ ray_constants.KV_NAMESPACE_SESSION,
1396
+ )
1397
+ # Add tracing_startup_hook to redis / internal kv manually
1398
+ # since internal kv is not yet initialized.
1399
+ if self._ray_params.tracing_startup_hook:
1400
+ self.get_gcs_client().internal_kv_put(
1401
+ b"tracing_startup_hook",
1402
+ self._ray_params.tracing_startup_hook.encode(),
1403
+ True,
1404
+ ray_constants.KV_NAMESPACE_TRACING,
1405
+ )
1406
+
1407
+ def start_head_processes(self):
1408
+ """Start head processes on the node."""
1409
+ logger.debug(
1410
+ f"Process STDOUT and STDERR is being " f"redirected to {self._logs_dir}."
1411
+ )
1412
+ assert self._gcs_address is None
1413
+ assert self._gcs_client is None
1414
+
1415
+ self.start_gcs_server()
1416
+ assert self.get_gcs_client() is not None
1417
+ self._write_cluster_info_to_kv()
1418
+
1419
+ if not self._ray_params.no_monitor:
1420
+ self.start_monitor()
1421
+
1422
+ if self._ray_params.ray_client_server_port:
1423
+ self.start_ray_client_server()
1424
+
1425
+ if self._ray_params.include_dashboard is None:
1426
+ # Default
1427
+ raise_on_api_server_failure = False
1428
+ else:
1429
+ raise_on_api_server_failure = self._ray_params.include_dashboard
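+ # Failures starting the API server are fatal only when the dashboard was
+ # explicitly requested; the default (None) only produces a warning.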
1430
+
1431
+ self.start_api_server(
1432
+ include_dashboard=self._ray_params.include_dashboard,
1433
+ raise_on_failure=raise_on_api_server_failure,
1434
+ )
1435
+
1436
+ def start_ray_processes(self):
1437
+ """Start all of the processes on the node."""
1438
+ logger.debug(
1439
+ f"Process STDOUT and STDERR is being " f"redirected to {self._logs_dir}."
1440
+ )
1441
+
1442
+ if not self.head:
1443
+ # Get the system config from GCS first if this is a non-head node.
1444
+ gcs_options = ray._raylet.GcsClientOptions.create(
1445
+ self.gcs_address,
1446
+ self.cluster_id.hex(),
1447
+ allow_cluster_id_nil=False,
1448
+ fetch_cluster_id_if_nil=False,
1449
+ )
1450
+ global_state = ray._private.state.GlobalState()
1451
+ global_state._initialize_global_state(gcs_options)
1452
+ new_config = global_state.get_system_config()
1453
+ assert self._config.items() <= new_config.items(), (
1454
+ "The system config from GCS is not a superset of the local"
1455
+ " system config. There might be a configuration inconsistency"
1456
+ " issue between the head node and non-head nodes."
1457
+ f" Local system config: {self._config},"
1458
+ f" GCS system config: {new_config}"
1459
+ )
1460
+ self._config = new_config
1461
+
1462
+ # Make sure we don't call `determine_plasma_store_config` multiple
1463
+ # times to avoid printing multiple warnings.
1464
+ resource_spec = self.get_resource_spec()
1465
+ (
1466
+ plasma_directory,
1467
+ object_store_memory,
1468
+ ) = ray._private.services.determine_plasma_store_config(
1469
+ resource_spec.object_store_memory,
1470
+ plasma_directory=self._ray_params.plasma_directory,
1471
+ huge_pages=self._ray_params.huge_pages,
1472
+ )
1473
+ self.start_raylet(plasma_directory, object_store_memory)
1474
+ if self._ray_params.include_log_monitor:
1475
+ self.start_log_monitor()
1476
+
1477
+ def _kill_process_type(
1478
+ self,
1479
+ process_type,
1480
+ allow_graceful: bool = False,
1481
+ check_alive: bool = True,
1482
+ wait: bool = False,
1483
+ ):
1484
+ """Kill a process of a given type.
1485
+
1486
+ If the process type is PROCESS_TYPE_REDIS_SERVER, then we will kill all
1487
+ of the Redis servers.
1488
+
1489
+ If the process was started in valgrind, then we will raise an exception
1490
+ if the process has a non-zero exit code.
1491
+
1492
+ Args:
1493
+ process_type: The type of the process to kill.
1494
+ allow_graceful: Send a SIGTERM first and give the process
1495
+ time to exit gracefully. If that doesn't work, then use
1496
+ SIGKILL. We usually want to do this outside of tests.
1497
+ check_alive: If true, then we expect the process to be alive
1498
+ and will raise an exception if the process is already dead.
1499
+ wait: If true, then this method will not return until the
1500
+ process in question has exited.
1501
+
1502
+ Raises:
1503
+ This method raises an exception in the following cases:
1504
+ 1. The process had already died and check_alive is true.
1505
+ 2. The process had been started in valgrind and had a non-zero
1506
+ exit code.
1507
+ """
1508
+
1509
+ # Ensure thread safety
1510
+ with self.removal_lock:
1511
+ self._kill_process_impl(
1512
+ process_type,
1513
+ allow_graceful=allow_graceful,
1514
+ check_alive=check_alive,
1515
+ wait=wait,
1516
+ )
1517
+
1518
+ def _kill_process_impl(
1519
+ self, process_type, allow_graceful=False, check_alive=True, wait=False
1520
+ ):
1521
+ """See `_kill_process_type`."""
1522
+ if process_type not in self.all_processes:
1523
+ return
1524
+ process_infos = self.all_processes[process_type]
1525
+ if process_type != ray_constants.PROCESS_TYPE_REDIS_SERVER:
1526
+ assert len(process_infos) == 1
1527
+ for process_info in process_infos:
1528
+ process = process_info.process
1529
+ # Handle the case where the process has already exited.
1530
+ if process.poll() is not None:
1531
+ if check_alive:
1532
+ raise RuntimeError(
1533
+ "Attempting to kill a process of type "
1534
+ f"'{process_type}', but this process is already dead."
1535
+ )
1536
+ else:
1537
+ continue
1538
+
1539
+ if process_info.use_valgrind:
1540
+ process.terminate()
1541
+ process.wait()
1542
+ if process.returncode != 0:
1543
+ message = (
1544
+ "Valgrind detected some errors in process of "
1545
+ f"type {process_type}. Error code {process.returncode}."
1546
+ )
1547
+ if process_info.stdout_file is not None:
1548
+ with open(process_info.stdout_file, "r") as f:
1549
+ message += "\nPROCESS STDOUT:\n" + f.read()
1550
+ if process_info.stderr_file is not None:
1551
+ with open(process_info.stderr_file, "r") as f:
1552
+ message += "\nPROCESS STDERR:\n" + f.read()
1553
+ raise RuntimeError(message)
1554
+ continue
1555
+
1556
+ if process_info.use_valgrind_profiler:
1557
+ # Give process signal to write profiler data.
1558
+ os.kill(process.pid, signal.SIGINT)
1559
+ # Wait for profiling data to be written.
1560
+ time.sleep(0.1)
1561
+
1562
+ if allow_graceful:
1563
+ process.terminate()
1564
+ # Allow the process one second to exit gracefully.
1565
+ timeout_seconds = 1
1566
+ try:
1567
+ process.wait(timeout_seconds)
1568
+ except subprocess.TimeoutExpired:
1569
+ pass
1570
+
1571
+ # If the process did not exit, force kill it.
1572
+ if process.poll() is None:
1573
+ process.kill()
1574
+ # The reason we usually don't call process.wait() here is that
1575
+ # there's some chance we'd end up waiting a really long time.
1576
+ if wait:
1577
+ process.wait()
1578
+
1579
+ del self.all_processes[process_type]
1580
+
1581
+ def kill_redis(self, check_alive: bool = True):
1582
+ """Kill the Redis servers.
1583
+
1584
+ Args:
1585
+ check_alive: Raise an exception if any of the processes
1586
+ were already dead.
1587
+ """
1588
+ self._kill_process_type(
1589
+ ray_constants.PROCESS_TYPE_REDIS_SERVER, check_alive=check_alive
1590
+ )
1591
+
1592
+ def kill_raylet(self, check_alive: bool = True):
1593
+ """Kill the raylet.
1594
+
1595
+ Args:
1596
+ check_alive: Raise an exception if the process was already
1597
+ dead.
1598
+ """
1599
+ self._kill_process_type(
1600
+ ray_constants.PROCESS_TYPE_RAYLET, check_alive=check_alive
1601
+ )
1602
+
1603
+ def kill_log_monitor(self, check_alive: bool = True):
1604
+ """Kill the log monitor.
1605
+
1606
+ Args:
1607
+ check_alive: Raise an exception if the process was already
1608
+ dead.
1609
+ """
1610
+ self._kill_process_type(
1611
+ ray_constants.PROCESS_TYPE_LOG_MONITOR, check_alive=check_alive
1612
+ )
1613
+
1614
+ def kill_reporter(self, check_alive: bool = True):
1615
+ """Kill the reporter.
1616
+
1617
+ Args:
1618
+ check_alive: Raise an exception if the process was already
1619
+ dead.
1620
+ """
1621
+ self._kill_process_type(
1622
+ ray_constants.PROCESS_TYPE_REPORTER, check_alive=check_alive
1623
+ )
1624
+
1625
+ def kill_dashboard(self, check_alive: bool = True):
1626
+ """Kill the dashboard.
1627
+
1628
+ Args:
1629
+ check_alive: Raise an exception if the process was already
1630
+ dead.
1631
+ """
1632
+ self._kill_process_type(
1633
+ ray_constants.PROCESS_TYPE_DASHBOARD, check_alive=check_alive
1634
+ )
1635
+
1636
+ def kill_monitor(self, check_alive: bool = True):
1637
+ """Kill the monitor.
1638
+
1639
+ Args:
1640
+ check_alive: Raise an exception if the process was already
1641
+ dead.
1642
+ """
1643
+ self._kill_process_type(
1644
+ ray_constants.PROCESS_TYPE_MONITOR, check_alive=check_alive
1645
+ )
1646
+
1647
+ def kill_gcs_server(self, check_alive: bool = True):
1648
+ """Kill the gcs server.
1649
+
1650
+ Args:
1651
+ check_alive: Raise an exception if the process was already
1652
+ dead.
1653
+ """
1654
+ self._kill_process_type(
1655
+ ray_constants.PROCESS_TYPE_GCS_SERVER, check_alive=check_alive, wait=True
1656
+ )
1657
+ # Clear GCS client and address to indicate no GCS server is running.
1658
+ self._gcs_address = None
1659
+ self._gcs_client = None
1660
+
1661
+ def kill_reaper(self, check_alive: bool = True):
1662
+ """Kill the reaper process.
1663
+
1664
+ Args:
1665
+ check_alive: Raise an exception if the process was already
1666
+ dead.
1667
+ """
1668
+ self._kill_process_type(
1669
+ ray_constants.PROCESS_TYPE_REAPER, check_alive=check_alive
1670
+ )
1671
+
1672
+ def kill_all_processes(self, check_alive=True, allow_graceful=False, wait=False):
1673
+ """Kill all of the processes.
1674
+
1675
+ Note that this is slower than necessary because it calls kill, wait,
1676
+ kill, wait, ... instead of kill, kill, ..., wait, wait, ...
1677
+
1678
+ Args:
1679
+ check_alive: Raise an exception if any of the processes were
1680
+ already dead.
1681
+ wait: If true, then this method will not return until the
1682
+ process in question has exited.
1683
+ """
1684
+ # Kill the raylet first. This is important for suppressing errors at
1685
+ # shutdown because we give the raylet a chance to exit gracefully and
1686
+ # clean up its child worker processes. If we were to kill the plasma
1687
+ # store (or Redis) first, that could cause the raylet to exit
1688
+ # ungracefully, leading to more verbose output from the workers.
1689
+ if ray_constants.PROCESS_TYPE_RAYLET in self.all_processes:
1690
+ self._kill_process_type(
1691
+ ray_constants.PROCESS_TYPE_RAYLET,
1692
+ check_alive=check_alive,
1693
+ allow_graceful=allow_graceful,
1694
+ wait=wait,
1695
+ )
1696
+
1697
+ if ray_constants.PROCESS_TYPE_GCS_SERVER in self.all_processes:
1698
+ self._kill_process_type(
1699
+ ray_constants.PROCESS_TYPE_GCS_SERVER,
1700
+ check_alive=check_alive,
1701
+ allow_graceful=allow_graceful,
1702
+ wait=wait,
1703
+ )
1704
+
1705
+ # We call "list" to copy the keys because we are modifying the
1706
+ # dictionary while iterating over it.
1707
+ for process_type in list(self.all_processes.keys()):
1708
+ # Need to kill the reaper process last in case we die unexpectedly
1709
+ # while cleaning up.
1710
+ if process_type != ray_constants.PROCESS_TYPE_REAPER:
1711
+ self._kill_process_type(
1712
+ process_type,
1713
+ check_alive=check_alive,
1714
+ allow_graceful=allow_graceful,
1715
+ wait=wait,
1716
+ )
1717
+
1718
+ if ray_constants.PROCESS_TYPE_REAPER in self.all_processes:
1719
+ self._kill_process_type(
1720
+ ray_constants.PROCESS_TYPE_REAPER,
1721
+ check_alive=check_alive,
1722
+ allow_graceful=allow_graceful,
1723
+ wait=wait,
1724
+ )
1725
+
1726
+ def live_processes(self):
1727
+ """Return a list of the live processes.
1728
+
1729
+ Returns:
1730
+ A list of the live processes.
1731
+ """
1732
+ result = []
1733
+ for process_type, process_infos in self.all_processes.items():
1734
+ for process_info in process_infos:
1735
+ if process_info.process.poll() is None:
1736
+ result.append((process_type, process_info.process))
1737
+ return result
1738
+
1739
+ def dead_processes(self):
1740
+ """Return a list of the dead processes.
1741
+
1742
+ Note that this ignores processes that have been explicitly killed,
1743
+ e.g., via a command like node.kill_raylet().
1744
+
1745
+ Returns:
1746
+ A list of the dead processes ignoring the ones that have been
1747
+ explicitly killed.
1748
+ """
1749
+ result = []
1750
+ for process_type, process_infos in self.all_processes.items():
1751
+ for process_info in process_infos:
1752
+ if process_info.process.poll() is not None:
1753
+ result.append((process_type, process_info.process))
1754
+ return result
1755
+
1756
+ def any_processes_alive(self):
1757
+ """Return true if any processes are still alive.
1758
+
1759
+ Returns:
1760
+ True if any process is still alive.
1761
+ """
1762
+ return any(self.live_processes())
1763
+
1764
+ def remaining_processes_alive(self):
1765
+ """Return true if all remaining processes are still alive.
1766
+
1767
+ Note that this ignores processes that have been explicitly killed,
1768
+ e.g., via a command like node.kill_raylet().
1769
+
1770
+ Returns:
1771
+ True if every process that wasn't explicitly killed is still alive.
1772
+ """
1773
+ return not any(self.dead_processes())
1774
+
1775
+ def destroy_external_storage(self):
1776
+ object_spilling_config = self._config.get("object_spilling_config", {})
1777
+ if object_spilling_config:
1778
+ object_spilling_config = json.loads(object_spilling_config)
1779
+ from ray._private import external_storage
1780
+
1781
+ storage = external_storage.setup_external_storage(
1782
+ object_spilling_config, self._node_id, self._session_name
1783
+ )
1784
+ storage.destroy_external_storage()
1785
+
1786
+ def validate_external_storage(self):
1787
+ """Make sure we can setup the object spilling external storage.
1788
+ This will also fill up the default setting for object spilling
1789
+ if not specified.
1790
+ """
1791
+ object_spilling_config = self._config.get("object_spilling_config", {})
1792
+ automatic_spilling_enabled = self._config.get(
1793
+ "automatic_object_spilling_enabled", True
1794
+ )
1795
+ if not automatic_spilling_enabled:
1796
+ return
1797
+
1798
+ if not object_spilling_config:
1799
+ object_spilling_config = os.environ.get("RAY_object_spilling_config", "")
1800
+
1801
+ # If the config is not specified, we fill up the default.
1802
+ if not object_spilling_config:
1803
+ object_spilling_config = json.dumps(
1804
+ {"type": "filesystem", "params": {"directory_path": self._session_dir}}
1805
+ )
1806
+
1807
+ # Try setting up the storage.
1808
+ # Configure the proper system config.
1809
+ # We need to set both ray param's system config and self._config
1810
+ # because they could've been diverged at this point.
1811
+ deserialized_config = json.loads(object_spilling_config)
1812
+ self._ray_params._system_config[
1813
+ "object_spilling_config"
1814
+ ] = object_spilling_config
1815
+ self._config["object_spilling_config"] = object_spilling_config
1816
+
1817
+ is_external_storage_type_fs = deserialized_config["type"] == "filesystem"
1818
+ self._ray_params._system_config[
1819
+ "is_external_storage_type_fs"
1820
+ ] = is_external_storage_type_fs
1821
+ self._config["is_external_storage_type_fs"] = is_external_storage_type_fs
1822
+
1823
+ # Validate external storage usage.
1824
+ from ray._private import external_storage
1825
+
1826
+ # Node ID is available only after GCS is connected. However,
1827
+ # validate_external_storage() needs to be called before it to
1828
+ # be able to validate the configs early. Therefore, we use a
1829
+ # dummy node ID here and make sure external storage can be set
1830
+ # up based on the provided config. This storage is destroyed
1831
+ # right after the validation.
1832
+ dummy_node_id = ray.NodeID.from_random().hex()
1833
+ storage = external_storage.setup_external_storage(
1834
+ deserialized_config, dummy_node_id, self._session_name
1835
+ )
1836
+ storage.destroy_external_storage()
1837
+ external_storage.reset_external_storage()
1838
+
1839
+ def _record_stats(self):
1840
+ # This is only called when a new node is started.
1841
+ # Initialize the internal kv so that the metrics can be put
1842
+ from ray._private.usage.usage_lib import (
1843
+ TagKey,
1844
+ record_extra_usage_tag,
1845
+ record_hardware_usage,
1846
+ )
1847
+
1848
+ if not ray.experimental.internal_kv._internal_kv_initialized():
1849
+ ray.experimental.internal_kv._initialize_internal_kv(self.get_gcs_client())
1850
+ assert ray.experimental.internal_kv._internal_kv_initialized()
1851
+ if self.head:
1852
+ # record head node stats
1853
+ gcs_storage_type = (
1854
+ "redis" if os.environ.get("RAY_REDIS_ADDRESS") is not None else "memory"
1855
+ )
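+ # The GCS is backed by external Redis only when RAY_REDIS_ADDRESS is set;
+ # otherwise it keeps its state in memory.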
1856
+ record_extra_usage_tag(TagKey.GCS_STORAGE, gcs_storage_type)
1857
+ cpu_model_name = ray._private.utils.get_current_node_cpu_model_name()
1858
+ if cpu_model_name:
1859
+ # CPU model name can be an arbitrary long string
1860
+ # so we truncate it to the first 50 characters
1861
+ # to avoid any issues.
1862
+ record_hardware_usage(cpu_model_name[:50])
.venv/lib/python3.11/site-packages/ray/_private/parameter.py ADDED
@@ -0,0 +1,483 @@
1
+ import logging
2
+ import os
3
+ from typing import Dict, List, Optional
4
+
5
+ import ray._private.ray_constants as ray_constants
6
+ from ray._private.utils import (
7
+ validate_node_labels,
8
+ check_ray_client_dependencies_installed,
9
+ )
10
+
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class RayParams:
16
+ """A class used to store the parameters used by Ray.
17
+
18
+ Attributes:
19
+ redis_address: The address of the Redis server to connect to. If
20
+ this address is not provided, then this command will start Redis, a
21
+ raylet, a plasma store, a plasma manager, and some workers.
22
+ It will also kill these processes when Python exits.
23
+ redis_port: The port that the primary Redis shard should listen
24
+ to. If None, then it will fall back to
25
+ ray._private.ray_constants.DEFAULT_PORT, or a random port if the default is
26
+ not available.
27
+ redis_shard_ports: A list of the ports to use for the non-primary Redis
28
+ shards. If None, then it will fall back to the ports right after
29
+ redis_port, or random ports if those are not available.
30
+ num_cpus: Number of CPUs to configure the raylet with.
31
+ num_gpus: Number of GPUs to configure the raylet with.
32
+ resources: A dictionary mapping the name of a resource to the quantity
33
+ of that resource available.
34
+ labels: The key-value labels of the node.
35
+ memory: Total available memory for workers requesting memory.
36
+ object_store_memory: The amount of memory (in bytes) to start the
37
+ object store with.
38
+ redis_max_memory: The max amount of memory (in bytes) to allow redis
39
+ to use, or None for no limit. Once the limit is exceeded, redis
40
+ will start LRU eviction of entries. This only applies to the
41
+ sharded redis tables (task and object tables).
42
+ object_manager_port: The port to use for the object manager.
43
+ node_manager_port: The port to use for the node manager.
44
+ gcs_server_port: The port to use for the GCS server.
45
+ node_ip_address: The IP address of the node that we are on.
46
+ raylet_ip_address: The IP address of the raylet that this node
47
+ connects to.
48
+ min_worker_port: The lowest port number that workers will bind
49
+ on. If not set or set to 0, random ports will be chosen.
50
+ max_worker_port: The highest port number that workers will bind
51
+ on. If set, min_worker_port must also be set.
52
+ worker_port_list: An explicit list of ports to be used for
53
+ workers (comma-separated). Overrides min_worker_port and
54
+ max_worker_port.
55
+ ray_client_server_port: The port number the ray client server
56
+ will bind on. If not set, the ray client server will not
57
+ be started.
58
+ object_ref_seed: Used to seed the deterministic generation of
59
+ object refs. The same value can be used across multiple runs of the
60
+ same job in order to generate the object refs in a consistent
61
+ manner. However, the same ID should not be used for different jobs.
62
+ redirect_output: True if stdout and stderr for non-worker
63
+ processes should be redirected to files and false otherwise.
64
+ external_addresses: The address of external Redis server to
65
+ connect to, in format of "ip1:port1,ip2:port2,...". If this
66
+ address is provided, then ray won't start Redis instances in the
67
+ head node but use external Redis server(s) instead.
68
+ num_redis_shards: The number of Redis shards to start in addition to
69
+ the primary Redis shard.
70
+ redis_max_clients: If provided, attempt to configure Redis with this
71
+ maxclients number.
72
+ redis_username: Prevents external clients without the username
73
+ from connecting to Redis if provided.
74
+ redis_password: Prevents external clients without the password
75
+ from connecting to Redis if provided.
76
+ plasma_directory: A directory where the Plasma memory mapped files will
77
+ be created.
78
+ worker_path: The path of the source code that will be run by the
79
+ worker.
80
+ setup_worker_path: The path of the Python file that will set up
81
+ the environment for the worker process.
82
+ huge_pages: Boolean flag indicating whether to start the Object
83
+ Store with hugetlbfs support. Requires plasma_directory.
84
+ include_dashboard: Boolean flag indicating whether to start the web
85
+ UI, which displays the status of the Ray cluster. If this value is
86
+ None, then the UI will be started if the relevant dependencies are
87
+ present.
88
+ dashboard_host: The host to bind the web UI server to. Can either be
89
+ localhost (127.0.0.1) or 0.0.0.0 (available from all interfaces).
90
+ By default, this is set to localhost to prevent access from
91
+ external machines.
92
+ dashboard_port: The port to bind the dashboard server to.
93
+ Defaults to 8265.
94
+ dashboard_agent_listen_port: The port for dashboard agents to listen on
95
+ for HTTP requests.
96
+ Defaults to 52365.
97
+ dashboard_grpc_port: The port for the dashboard head process to listen
98
+ for gRPC on.
99
+ Defaults to random available port.
100
+ runtime_env_agent_port: The port at which the runtime env agent
101
+ listens to for HTTP.
102
+ Defaults to random available port.
103
+ plasma_store_socket_name: If provided, it specifies the socket
104
+ name used by the plasma store.
105
+ raylet_socket_name: If provided, it specifies the socket path
106
+ used by the raylet process.
107
+ temp_dir: If provided, it will specify the root temporary
108
+ directory for the Ray process. Must be an absolute path.
109
+ storage: Specify a URI for persistent cluster-wide storage. This storage path
110
+ must be accessible by all nodes of the cluster, otherwise an error will be
111
+ raised.
112
+ runtime_env_dir_name: If provided, specifies the directory that
113
+ will be created in the session dir to hold runtime_env files.
114
+ include_log_monitor: If True, then start a log monitor to
115
+ monitor the log files for all processes on this node and push their
116
+ contents to Redis.
117
+ autoscaling_config: path to autoscaling config file.
118
+ metrics_agent_port: The port to bind metrics agent.
119
+ metrics_export_port: The port at which metrics are exposed
120
+ through a Prometheus endpoint.
121
+ no_monitor: If True, the ray autoscaler monitor for this cluster
122
+ will not be started.
123
+ _system_config: Configuration for overriding RayConfig
124
+ defaults. Used to set system configuration and for experimental Ray
125
+ core feature flags.
126
+ enable_object_reconstruction: Enable plasma reconstruction on
127
+ failure.
128
+ ray_debugger_external: If true, make the Ray debugger for a
129
+ worker available externally to the node it is running on. This will
130
+ bind on 0.0.0.0 instead of localhost.
131
+ env_vars: Override environment variables for the raylet.
132
+ session_name: The name of the session of the ray cluster.
133
+ webui: The url of the UI.
134
+ cluster_id: The cluster ID in hex string.
135
+ enable_physical_mode: Whether physical mode is enabled, which applies
136
+ constraint to tasks' resource consumption. As of now, only memory resource
137
+ is supported.
138
+ """
139
+
140
+ def __init__(
141
+ self,
142
+ redis_address: Optional[str] = None,
143
+ gcs_address: Optional[str] = None,
144
+ num_cpus: Optional[int] = None,
145
+ num_gpus: Optional[int] = None,
146
+ resources: Optional[Dict[str, float]] = None,
147
+ labels: Optional[Dict[str, str]] = None,
148
+ memory: Optional[float] = None,
149
+ object_store_memory: Optional[float] = None,
150
+ redis_max_memory: Optional[float] = None,
151
+ redis_port: Optional[int] = None,
152
+ redis_shard_ports: Optional[List[int]] = None,
153
+ object_manager_port: Optional[int] = None,
154
+ node_manager_port: int = 0,
155
+ gcs_server_port: Optional[int] = None,
156
+ node_ip_address: Optional[str] = None,
157
+ node_name: Optional[str] = None,
158
+ raylet_ip_address: Optional[str] = None,
159
+ min_worker_port: Optional[int] = None,
160
+ max_worker_port: Optional[int] = None,
161
+ worker_port_list: Optional[List[int]] = None,
162
+ ray_client_server_port: Optional[int] = None,
163
+ object_ref_seed: Optional[int] = None,
164
+ driver_mode=None,
165
+ redirect_output: Optional[bool] = None,
166
+ external_addresses: Optional[List[str]] = None,
167
+ num_redis_shards: Optional[int] = None,
168
+ redis_max_clients: Optional[int] = None,
169
+ redis_username: Optional[str] = ray_constants.REDIS_DEFAULT_USERNAME,
170
+ redis_password: Optional[str] = ray_constants.REDIS_DEFAULT_PASSWORD,
171
+ plasma_directory: Optional[str] = None,
172
+ worker_path: Optional[str] = None,
173
+ setup_worker_path: Optional[str] = None,
174
+ huge_pages: Optional[bool] = False,
175
+ include_dashboard: Optional[bool] = None,
176
+ dashboard_host: Optional[str] = ray_constants.DEFAULT_DASHBOARD_IP,
177
+ dashboard_port: Optional[bool] = ray_constants.DEFAULT_DASHBOARD_PORT,
178
+ dashboard_agent_listen_port: Optional[
179
+ int
180
+ ] = ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
181
+ runtime_env_agent_port: Optional[int] = None,
182
+ dashboard_grpc_port: Optional[int] = None,
183
+ plasma_store_socket_name: Optional[str] = None,
184
+ raylet_socket_name: Optional[str] = None,
185
+ temp_dir: Optional[str] = None,
186
+ storage: Optional[str] = None,
187
+ runtime_env_dir_name: Optional[str] = None,
188
+ include_log_monitor: Optional[str] = None,
189
+ autoscaling_config: Optional[str] = None,
190
+ ray_debugger_external: bool = False,
191
+ _system_config: Optional[Dict[str, str]] = None,
192
+ enable_object_reconstruction: Optional[bool] = False,
193
+ metrics_agent_port: Optional[int] = None,
194
+ metrics_export_port: Optional[int] = None,
195
+ tracing_startup_hook=None,
196
+ no_monitor: Optional[bool] = False,
197
+ env_vars: Optional[Dict[str, str]] = None,
198
+ session_name: Optional[str] = None,
199
+ webui: Optional[str] = None,
200
+ cluster_id: Optional[str] = None,
201
+ node_id: Optional[str] = None,
202
+ enable_physical_mode: bool = False,
203
+ ):
204
+ self.redis_address = redis_address
205
+ self.gcs_address = gcs_address
206
+ self.num_cpus = num_cpus
207
+ self.num_gpus = num_gpus
208
+ self.memory = memory
209
+ self.object_store_memory = object_store_memory
210
+ self.resources = resources
211
+ self.redis_max_memory = redis_max_memory
212
+ self.redis_port = redis_port
213
+ self.redis_shard_ports = redis_shard_ports
214
+ self.object_manager_port = object_manager_port
215
+ self.node_manager_port = node_manager_port
216
+ self.gcs_server_port = gcs_server_port
217
+ self.node_ip_address = node_ip_address
218
+ self.node_name = node_name
219
+ self.raylet_ip_address = raylet_ip_address
220
+ self.min_worker_port = min_worker_port
221
+ self.max_worker_port = max_worker_port
222
+ self.worker_port_list = worker_port_list
223
+ self.ray_client_server_port = ray_client_server_port
224
+ self.driver_mode = driver_mode
225
+ self.redirect_output = redirect_output
226
+ self.external_addresses = external_addresses
227
+ self.num_redis_shards = num_redis_shards
228
+ self.redis_max_clients = redis_max_clients
229
+ self.redis_username = redis_username
230
+ self.redis_password = redis_password
231
+ self.plasma_directory = plasma_directory
232
+ self.worker_path = worker_path
233
+ self.setup_worker_path = setup_worker_path
234
+ self.huge_pages = huge_pages
235
+ self.include_dashboard = include_dashboard
236
+ self.dashboard_host = dashboard_host
237
+ self.dashboard_port = dashboard_port
238
+ self.dashboard_agent_listen_port = dashboard_agent_listen_port
239
+ self.dashboard_grpc_port = dashboard_grpc_port
240
+ self.runtime_env_agent_port = runtime_env_agent_port
241
+ self.plasma_store_socket_name = plasma_store_socket_name
242
+ self.raylet_socket_name = raylet_socket_name
243
+ self.temp_dir = temp_dir
244
+ self.storage = storage or os.environ.get(
245
+ ray_constants.RAY_STORAGE_ENVIRONMENT_VARIABLE
246
+ )
247
+ self.runtime_env_dir_name = (
248
+ runtime_env_dir_name or ray_constants.DEFAULT_RUNTIME_ENV_DIR_NAME
249
+ )
250
+ self.include_log_monitor = include_log_monitor
251
+ self.autoscaling_config = autoscaling_config
252
+ self.metrics_agent_port = metrics_agent_port
253
+ self.metrics_export_port = metrics_export_port
254
+ self.tracing_startup_hook = tracing_startup_hook
255
+ self.no_monitor = no_monitor
256
+ self.object_ref_seed = object_ref_seed
257
+ self.ray_debugger_external = ray_debugger_external
258
+ self.env_vars = env_vars
259
+ self.session_name = session_name
260
+ self.webui = webui
261
+ self._system_config = _system_config or {}
262
+ self._enable_object_reconstruction = enable_object_reconstruction
263
+ self.labels = labels
264
+ self._check_usage()
265
+ self.cluster_id = cluster_id
266
+ self.node_id = node_id
267
+ self.enable_physical_mode = enable_physical_mode
268
+
269
+ # Set the internal config options for object reconstruction.
270
+ if enable_object_reconstruction:
271
+ # Turn off object pinning.
272
+ if self._system_config is None:
273
+ self._system_config = dict()
274
+ print(self._system_config)
275
+ self._system_config["lineage_pinning_enabled"] = True
276
+
277
+ def update(self, **kwargs):
278
+ """Update the settings according to the keyword arguments.
279
+
280
+ Args:
281
+ kwargs: The keyword arguments to set corresponding fields.
282
+ """
283
+ for arg in kwargs:
284
+ if hasattr(self, arg):
285
+ setattr(self, arg, kwargs[arg])
286
+ else:
287
+ raise ValueError(f"Invalid RayParams parameter in update: {arg}")
288
+
289
+ self._check_usage()
290
+
291
+ def update_if_absent(self, **kwargs):
292
+ """Update the settings when the target fields are None.
293
+
294
+ Args:
295
+ kwargs: The keyword arguments to set corresponding fields.
296
+ """
297
+ for arg in kwargs:
298
+ if hasattr(self, arg):
299
+ if getattr(self, arg) is None:
300
+ setattr(self, arg, kwargs[arg])
301
+ else:
302
+ raise ValueError(
303
+ f"Invalid RayParams parameter in update_if_absent: {arg}"
304
+ )
305
+
306
+ self._check_usage()
307
+
308
+ def update_pre_selected_port(self):
309
+ """Update the pre-selected port information
310
+
311
+ Returns:
312
+ The dictionary mapping of component -> ports.
313
+ """
314
+
315
+ def wrap_port(port):
316
+ # 0 port means select a random port for the grpc server.
317
+ if port is None or port == 0:
318
+ return []
319
+ else:
320
+ return [port]
321
+
322
+ # Create a dictionary of the component -> port mapping.
323
+ pre_selected_ports = {
324
+ "gcs": wrap_port(self.redis_port),
325
+ "object_manager": wrap_port(self.object_manager_port),
326
+ "node_manager": wrap_port(self.node_manager_port),
327
+ "gcs_server": wrap_port(self.gcs_server_port),
328
+ "client_server": wrap_port(self.ray_client_server_port),
329
+ "dashboard": wrap_port(self.dashboard_port),
330
+ "dashboard_agent_grpc": wrap_port(self.metrics_agent_port),
331
+ "dashboard_agent_http": wrap_port(self.dashboard_agent_listen_port),
332
+ "dashboard_grpc": wrap_port(self.dashboard_grpc_port),
333
+ "runtime_env_agent": wrap_port(self.runtime_env_agent_port),
334
+ "metrics_export": wrap_port(self.metrics_export_port),
335
+ }
336
+ redis_shard_ports = self.redis_shard_ports
337
+ if redis_shard_ports is None:
338
+ redis_shard_ports = []
339
+ pre_selected_ports["redis_shards"] = redis_shard_ports
340
+ if self.worker_port_list is None:
341
+ if self.min_worker_port is not None and self.max_worker_port is not None:
342
+ pre_selected_ports["worker_ports"] = list(
343
+ range(self.min_worker_port, self.max_worker_port + 1)
344
+ )
345
+ else:
346
+ # The dict is not updated when it requires random ports.
347
+ pre_selected_ports["worker_ports"] = []
348
+ else:
349
+ pre_selected_ports["worker_ports"] = [
350
+ int(port) for port in self.worker_port_list.split(",")
351
+ ]
352
+
353
+ # Update the pre selected port set.
354
+ self.reserved_ports = set()
355
+ for comp, port_list in pre_selected_ports.items():
356
+ for port in port_list:
357
+ if port in self.reserved_ports:
358
+ raise ValueError(
359
+ f"Ray component {comp} is trying to use "
360
+ f"a port number {port} that is used by other components.\n"
361
+ f"Port information: {self._format_ports(pre_selected_ports)}\n"
362
+ "If you allocate ports, please make sure the same port "
363
+ "is not used by multiple components."
364
+ )
365
+ self.reserved_ports.add(port)
366
+
367
+ def _check_usage(self):
368
+ if self.worker_port_list is not None:
369
+ for port_str in self.worker_port_list.split(","):
370
+ try:
371
+ port = int(port_str)
372
+ except ValueError as e:
373
+ raise ValueError(
374
+ "worker_port_list must be a comma-separated "
375
+ f"list of integers: {e}"
376
+ ) from None
377
+
378
+ if port < 1024 or port > 65535:
379
+ raise ValueError(
380
+ "Ports in worker_port_list must be "
381
+ f"between 1024 and 65535. Got: {port}"
382
+ )
383
+
384
+ # Used primarily for testing.
385
+ if os.environ.get("RAY_USE_RANDOM_PORTS", False):
386
+ if self.min_worker_port is None and self.max_worker_port is None:
387
+ self.min_worker_port = 0
388
+ self.max_worker_port = 0
389
+
390
+ if self.min_worker_port is not None:
391
+ if self.min_worker_port != 0 and (
392
+ self.min_worker_port < 1024 or self.min_worker_port > 65535
393
+ ):
394
+ raise ValueError(
395
+ "min_worker_port must be 0 or an integer between 1024 and 65535."
396
+ )
397
+
398
+ if self.max_worker_port is not None:
399
+ if self.min_worker_port is None:
400
+ raise ValueError(
401
+ "If max_worker_port is set, min_worker_port must also be set."
402
+ )
403
+ elif self.max_worker_port != 0:
404
+ if self.max_worker_port < 1024 or self.max_worker_port > 65535:
405
+ raise ValueError(
406
+ "max_worker_port must be 0 or an integer between "
407
+ "1024 and 65535."
408
+ )
409
+ elif self.max_worker_port <= self.min_worker_port:
410
+ raise ValueError(
411
+ "max_worker_port must be higher than min_worker_port."
412
+ )
413
+ if self.ray_client_server_port is not None:
414
+ if not check_ray_client_dependencies_installed():
415
+ raise ValueError(
416
+ "Ray Client requires pip package `ray[client]`. "
417
+ "If you installed the minimal Ray (e.g. `pip install ray`), "
418
+ "please reinstall by executing `pip install ray[client]`."
419
+ )
420
+ if (
421
+ self.ray_client_server_port < 1024
422
+ or self.ray_client_server_port > 65535
423
+ ):
424
+ raise ValueError(
425
+ "ray_client_server_port must be an integer "
426
+ "between 1024 and 65535."
427
+ )
428
+ if self.runtime_env_agent_port is not None:
429
+ if (
430
+ self.runtime_env_agent_port < 1024
431
+ or self.runtime_env_agent_port > 65535
432
+ ):
433
+ raise ValueError(
434
+ "runtime_env_agent_port must be an integer "
435
+ "between 1024 and 65535."
436
+ )
437
+
438
+ if self.resources is not None:
439
+
440
+ def build_error(resource, alternative):
441
+ return (
442
+ f"{self.resources} -> `{resource}` cannot be a "
443
+ "custom resource because it is one of the default resources "
444
+ f"({ray_constants.DEFAULT_RESOURCES}). "
445
+ f"Use `{alternative}` instead. For example, use `ray start "
446
+ f"--{alternative.replace('_', '-')}=1` instead of "
447
+ f"`ray start --resources={{'{resource}': 1}}`"
448
+ )
449
+
450
+ assert "CPU" not in self.resources, build_error("CPU", "num_cpus")
451
+ assert "GPU" not in self.resources, build_error("GPU", "num_gpus")
452
+ assert "memory" not in self.resources, build_error("memory", "memory")
453
+ assert "object_store_memory" not in self.resources, build_error(
454
+ "object_store_memory", "object_store_memory"
455
+ )
456
+
457
+ if self.redirect_output is not None:
458
+ raise DeprecationWarning("The redirect_output argument is deprecated.")
459
+
460
+ if self.temp_dir is not None and not os.path.isabs(self.temp_dir):
461
+ raise ValueError("temp_dir must be absolute path or None.")
462
+
463
+ validate_node_labels(self.labels)
464
+
465
+ def _format_ports(self, pre_selected_ports):
466
+ """Format the pre-selected ports information to be more human-readable."""
467
+ ports = pre_selected_ports.copy()
468
+
469
+ for comp, port_list in ports.items():
470
+ if len(port_list) == 1:
471
+ ports[comp] = port_list[0]
472
+ elif len(port_list) == 0:
473
+ # Nothing is selected, meaning it will be randomly selected.
474
+ ports[comp] = "random"
475
+ elif comp == "worker_ports":
476
+ min_port = port_list[0]
477
+ max_port = port_list[len(port_list) - 1]
478
+ if len(port_list) < 50:
479
+ port_range_str = str(port_list)
480
+ else:
481
+ port_range_str = f"from {min_port} to {max_port}"
482
+ ports[comp] = f"{len(port_list)} ports {port_range_str}"
483
+ return ports
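
For readers skimming this diff, the snippet below is a minimal, hypothetical sketch of how the `RayParams` fields above behave; it assumes the ray wheel added in this commit is importable and that the example port numbers are free on the machine.

```python
# Hedged sketch: exercises update_if_absent() and update_pre_selected_port()
# exactly as defined above; the concrete values are illustrative only.
from ray._private.parameter import RayParams

params = RayParams(
    num_cpus=4,
    dashboard_port=8265,
    min_worker_port=10002,
    max_worker_port=10005,
)

# update_if_absent only fills fields that are still None, so num_cpus=4 wins
# while object_store_memory picks up the new value.
params.update_if_absent(num_cpus=8, object_store_memory=10**9)
assert params.num_cpus == 4

# Collect every explicitly chosen port; a duplicate across components would
# raise ValueError. The result lands in params.reserved_ports.
params.update_pre_selected_port()
print(sorted(params.reserved_ports))
```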
.venv/lib/python3.11/site-packages/ray/_private/process_watcher.py ADDED
@@ -0,0 +1,198 @@
1
+ import asyncio
2
+ import io
3
+ import logging
4
+ import sys
5
+ import os
6
+
7
+ from concurrent.futures import ThreadPoolExecutor
8
+
9
+ import ray
10
+ from ray.dashboard.consts import _PARENT_DEATH_THREASHOLD
11
+ import ray.dashboard.consts as dashboard_consts
12
+ import ray._private.ray_constants as ray_constants
13
+ from ray._private.utils import run_background_task
14
+
15
+ # Import psutil after ray so the packaged version is used.
16
+ import psutil
17
+
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # TODO: move all consts from dashboard_consts to ray_constants and rename to remove
22
+ # DASHBOARD_ prefixes.
23
+
24
+ # Publishes at most this number of lines of Raylet logs, when the Raylet dies
25
+ # unexpectedly.
26
+ _RAYLET_LOG_MAX_PUBLISH_LINES = 20
27
+
28
+ # Reads at most this amount of Raylet logs from the tail, for publishing and
29
+ # checking if the Raylet was terminated gracefully.
30
+ _RAYLET_LOG_MAX_TAIL_SIZE = 1 * 1024**2
31
+
32
+ try:
33
+ create_task = asyncio.create_task
34
+ except AttributeError:
35
+ create_task = asyncio.ensure_future
36
+
37
+
38
+ def get_raylet_pid():
39
+ # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
40
+ # only used for fate-sharing with the raylet and we need a different
41
+ # fate-sharing mechanism for Windows anyways.
42
+ if sys.platform in ["win32", "cygwin"]:
43
+ return None
44
+ raylet_pid = int(os.environ["RAY_RAYLET_PID"])
45
+ assert raylet_pid > 0
46
+ logger.info("raylet pid is %s", raylet_pid)
47
+ return raylet_pid
48
+
49
+
50
+ def create_check_raylet_task(log_dir, gcs_address, parent_dead_callback, loop):
51
+ """
52
+ Creates an asyncio task to periodically check if the raylet process is still
53
+ running. If raylet is dead for _PARENT_DEATH_THREASHOLD (5) times, prepare to exit
54
+ as follows:
55
+
56
+ - Write logs about whether the raylet exit is graceful, by looking into the raylet
57
+ log and search for term "SIGTERM",
58
+ - Flush the logs via GcsPublisher,
59
+ - Exit.
60
+ """
61
+ if sys.platform in ["win32", "cygwin"]:
62
+ raise RuntimeError("can't check raylet process in Windows.")
63
+ raylet_pid = get_raylet_pid()
64
+
65
+ if dashboard_consts.PARENT_HEALTH_CHECK_BY_PIPE:
66
+ logger.info("check_parent_via_pipe")
67
+ check_parent_task = _check_parent_via_pipe(
68
+ log_dir, gcs_address, loop, parent_dead_callback
69
+ )
70
+ else:
71
+ logger.info("_check_parent")
72
+ check_parent_task = _check_parent(
73
+ raylet_pid, log_dir, gcs_address, parent_dead_callback
74
+ )
75
+
76
+ return run_background_task(check_parent_task)
77
+
78
+
79
+ def report_raylet_error_logs(log_dir: str, gcs_address: str):
80
+ log_path = os.path.join(log_dir, "raylet.out")
81
+ error = False
82
+ msg = "Raylet is terminated. "
83
+ try:
84
+ with open(log_path, "r", encoding="utf-8") as f:
85
+ # Seek to _RAYLET_LOG_MAX_TAIL_SIZE from the end if the
86
+ # file is larger than that.
87
+ f.seek(0, io.SEEK_END)
88
+ pos = max(0, f.tell() - _RAYLET_LOG_MAX_TAIL_SIZE)
89
+ f.seek(pos, io.SEEK_SET)
90
+ # Read remaining logs by lines.
91
+ raylet_logs = f.readlines()
92
+ # Assume the SIGTERM message must exist within the last
93
+ # _RAYLET_LOG_MAX_TAIL_SIZE of the log file.
94
+ if any("Raylet received SIGTERM" in line for line in raylet_logs):
95
+ msg += "Termination is graceful."
96
+ logger.info(msg)
97
+ else:
98
+ msg += (
99
+ "Termination is unexpected. Possible reasons "
100
+ "include: (1) SIGKILL by the user or system "
101
+ "OOM killer, (2) Invalid memory access from "
102
+ "Raylet causing SIGSEGV or SIGBUS, "
103
+ "(3) Other termination signals. "
104
+ f"Last {_RAYLET_LOG_MAX_PUBLISH_LINES} lines "
105
+ "of the Raylet logs:\n"
106
+ )
107
+ msg += " " + " ".join(
108
+ raylet_logs[-_RAYLET_LOG_MAX_PUBLISH_LINES:]
109
+ )
110
+ error = True
111
+ except Exception as e:
112
+ msg += f"Failed to read Raylet logs at {log_path}: {e}!"
113
+ logger.exception(msg)
114
+ error = True
115
+ if error:
116
+ logger.error(msg)
117
+ # TODO: switch to async if necessary.
118
+ ray._private.utils.publish_error_to_driver(
119
+ ray_constants.RAYLET_DIED_ERROR,
120
+ msg,
121
+ gcs_publisher=ray._raylet.GcsPublisher(address=gcs_address),
122
+ )
123
+ else:
124
+ logger.info(msg)
125
+
126
+
127
+ async def _check_parent_via_pipe(
128
+ log_dir: str, gcs_address: str, loop, parent_dead_callback
129
+ ):
130
+ while True:
131
+ try:
132
+ # Read input asynchronously.
133
+ # The parent (raylet) should have redirected its pipe
134
+ # to stdin. If we read 0 bytes from stdin, it means
135
+ # the process is dead.
136
+ with ThreadPoolExecutor(max_workers=1) as executor:
137
+ input_data = await loop.run_in_executor(
138
+ executor, lambda: sys.stdin.readline()
139
+ )
140
+ if len(input_data) == 0:
141
+ # cannot read bytes from parent == parent is dead.
142
+ parent_dead_callback("_check_parent_via_pipe: The parent is dead.")
143
+ report_raylet_error_logs(log_dir, gcs_address)
144
+ sys.exit(0)
145
+ except Exception as e:
146
+ logger.exception(
147
+ "raylet health checking is failed. "
148
+ f"The agent process may leak. Exception: {e}"
149
+ )
150
+
151
+
152
+ async def _check_parent(raylet_pid, log_dir, gcs_address, parent_dead_callback):
153
+ """Check if raylet is dead and fate-share if it is."""
154
+ try:
155
+ curr_proc = psutil.Process()
156
+ parent_death_cnt = 0
157
+ while True:
158
+ parent = curr_proc.parent()
159
+ # If the parent is dead, it is None.
160
+ parent_gone = parent is None
161
+ init_assigned_for_parent = False
162
+ parent_changed = False
163
+
164
+ if parent:
165
+ # Sometimes, the parent is changed to the `init` process.
166
+ # In this case, the parent.pid is 1.
167
+ init_assigned_for_parent = parent.pid == 1
168
+ # Sometimes, the parent is dead, and the pid is reused
169
+ # by other processes. In this case, this condition is triggered.
170
+ parent_changed = raylet_pid != parent.pid
171
+
172
+ if parent_gone or init_assigned_for_parent or parent_changed:
173
+ parent_death_cnt += 1
174
+ logger.warning(
175
+ f"Raylet is considered dead {parent_death_cnt} X. "
176
+ f"If it reaches to {_PARENT_DEATH_THREASHOLD}, the agent "
177
+ f"will kill itself. Parent: {parent}, "
178
+ f"parent_gone: {parent_gone}, "
179
+ f"init_assigned_for_parent: {init_assigned_for_parent}, "
180
+ f"parent_changed: {parent_changed}."
181
+ )
182
+ if parent_death_cnt < _PARENT_DEATH_THREASHOLD:
183
+ await asyncio.sleep(
184
+ dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S
185
+ )
186
+ continue
187
+
188
+ parent_dead_callback("_check_parent: The parent is dead.")
189
+ report_raylet_error_logs(log_dir, gcs_address)
190
+ sys.exit(0)
191
+ else:
192
+ parent_death_cnt = 0
193
+ await asyncio.sleep(
194
+ dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S
195
+ )
196
+ except Exception:
197
+ logger.exception("Failed to check parent PID, exiting.")
198
+ sys.exit(1)
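
As a side note for reviewers, the parent-liveness logic in `_check_parent` above boils down to the psutil checks sketched below; this standalone version uses illustrative threshold and interval values rather than Ray's dashboard constants.

```python
# Hedged sketch of the fate-sharing check above, using only psutil.
import time

import psutil

EXPECTED_PARENT_PID = psutil.Process().ppid()  # captured once at startup


def parent_is_gone(expected_pid: int) -> bool:
    parent = psutil.Process().parent()
    if parent is None:  # parent exited and was reaped
        return True
    if parent.pid == 1:  # re-parented to init after the parent died
        return True
    return parent.pid != expected_pid  # PID was reused by another process


def watch(threshold: int = 5, interval_s: float = 0.4) -> None:
    misses = 0
    while True:
        misses = misses + 1 if parent_is_gone(EXPECTED_PARENT_PID) else 0
        if misses >= threshold:
            raise SystemExit("parent process is dead; exiting to fate-share")
        time.sleep(interval_s)
```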
.venv/lib/python3.11/site-packages/ray/_private/profiling.py ADDED
@@ -0,0 +1,240 @@
1
+ import os
2
+ import json
3
+ from collections import defaultdict
4
+ from dataclasses import dataclass, asdict
5
+ from typing import List, Dict, Union
6
+
7
+ import ray
8
+
9
+
10
+ class _NullLogSpan:
11
+ """A log span context manager that does nothing"""
12
+
13
+ def __enter__(self):
14
+ pass
15
+
16
+ def __exit__(self, type, value, tb):
17
+ pass
18
+
19
+
20
+ PROFILING_ENABLED = "RAY_PROFILING" in os.environ
21
+ NULL_LOG_SPAN = _NullLogSpan()
22
+
23
+ # Colors are specified at
24
+ # https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html. # noqa: E501
25
+ _default_color_mapping = defaultdict(
26
+ lambda: "generic_work",
27
+ {
28
+ "worker_idle": "cq_build_abandoned",
29
+ "task": "rail_response",
30
+ "task:deserialize_arguments": "rail_load",
31
+ "task:execute": "rail_animation",
32
+ "task:store_outputs": "rail_idle",
33
+ "wait_for_function": "detailed_memory_dump",
34
+ "ray.get": "good",
35
+ "ray.put": "terrible",
36
+ "ray.wait": "vsync_highlight_color",
37
+ "submit_task": "background_memory_dump",
38
+ "fetch_and_run_function": "detailed_memory_dump",
39
+ "register_remote_function": "detailed_memory_dump",
40
+ },
41
+ )
42
+
43
+
44
+ @dataclass(init=True)
45
+ class ChromeTracingCompleteEvent:
46
+ # https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#heading=h.lpfof2aylapb # noqa
47
+ # The event categories. This is a comma separated list of categories
48
+ # for the event. The categories can be used to hide events in
49
+ # the Trace Viewer UI.
50
+ cat: str
51
+ # The string displayed on the event.
52
+ name: str
53
+ # The identifier for the group of rows that the event
54
+ # appears in.
55
+ pid: int
56
+ # The identifier for the row that the event appears in.
57
+ tid: int
58
+ # The start time in microseconds.
59
+ ts: int
60
+ # The duration in microseconds.
61
+ dur: int
62
+ # This is the name of the color to display the box in.
63
+ cname: str
64
+ # The extra user-defined data.
65
+ args: Dict[str, Union[str, int]]
66
+ # The event type (X means the complete event).
67
+ ph: str = "X"
68
+
69
+
70
+ @dataclass(init=True)
71
+ class ChromeTracingMetadataEvent:
72
+ # https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#bookmark=id.iycbnb4z7i9g # noqa
73
+ name: str
74
+ # Metadata arguments. E.g., name: <metadata_name>
75
+ args: Dict[str, str]
76
+ # The process id of this event. In Ray, pid indicates the node.
77
+ pid: int
78
+ # The thread id of this event. In Ray, tid indicates each worker.
79
+ tid: int = None
80
+ # M means the metadata event.
81
+ ph: str = "M"
82
+
83
+
84
+ def profile(event_type, extra_data=None):
85
+ """Profile a span of time so that it appears in the timeline visualization.
86
+
87
+ Note that this only works in the raylet code path.
88
+
89
+ This function can be used as follows (both on the driver or within a task).
90
+
91
+ .. testcode::
92
+ import ray._private.profiling as profiling
93
+
94
+ with profiling.profile("custom event", extra_data={'key': 'val'}):
95
+ # Do some computation here.
96
+ x = 1 * 2
97
+
98
+ Optionally, a dictionary can be passed as the "extra_data" argument, and
99
+ it can have keys "name" and "cname" if you want to override the default
100
+ timeline display text and box color. Other values will appear at the bottom
101
+ of the chrome tracing GUI when you click on the box corresponding to this
102
+ profile span.
103
+
104
+ Args:
105
+ event_type: A string describing the type of the event.
106
+ extra_data: This must be a dictionary mapping strings to strings. This
107
+ data will be added to the json objects that are used to populate
108
+ the timeline, so if you want to set a particular color, you can
109
+ simply set the "cname" attribute to an appropriate color.
110
+ Similarly, if you set the "name" attribute, then that will set the
111
+ text displayed on the box in the timeline.
112
+
113
+ Returns:
114
+ An object that can profile a span of time via a "with" statement.
115
+ """
116
+ if not PROFILING_ENABLED:
117
+ return NULL_LOG_SPAN
118
+ worker = ray._private.worker.global_worker
119
+ if worker.mode == ray._private.worker.LOCAL_MODE:
120
+ return NULL_LOG_SPAN
121
+ return worker.core_worker.profile_event(event_type.encode("ascii"), extra_data)
122
+
123
+
124
+ def chrome_tracing_dump(
125
+ tasks: List[dict],
126
+ ) -> str:
127
+ """Generate a chrome/perfetto tracing dump using task events.
128
+
129
+ Args:
130
+ tasks: List of tasks generated by a state API list_tasks(detail=True).
131
+
132
+ Returns:
133
+ Json serialized dump to create a chrome/perfetto tracing.
134
+ """
135
+ # All events from given tasks.
136
+ all_events = []
137
+
138
+ # Chrome tracing doesn't have a concept of "node". Instead, we use
139
+ # chrome tracing's pid == ray's node.
140
+ # chrome tracing's tid == ray's process.
141
+ # Note that pid or tid is usually integer, but ray's node/process has
142
+ # ids in string.
143
+ # Unfortunately, perfetto doesn't allow to have string as a value of pid/tid.
144
+ # To workaround it, we use Metadata event from chrome tracing schema
145
+ # (https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#heading=h.xqopa5m0e28f) # noqa
146
+ # which allows pid/tid -> name mapping. In order to use this schema
147
+ # we build node_ip/(node_ip, worker_id) -> arbitrary index mapping.
148
+
149
+ # node ip address -> node idx.
150
+ node_to_index = {}
151
+ # Arbitrary index mapped to the ip address.
152
+ node_idx = 0
153
+ # (node index, worker id) -> worker idx
154
+ worker_to_index = {}
155
+ # Arbitrary index mapped to the (node index, worker id).
156
+ worker_idx = 0
157
+
158
+ for task in tasks:
159
+ profiling_data = task.get("profiling_data", [])
160
+ if profiling_data:
161
+ node_ip_address = profiling_data["node_ip_address"]
162
+ component_events = profiling_data["events"]
163
+ component_type = profiling_data["component_type"]
164
+ component_id = component_type + ":" + profiling_data["component_id"]
165
+
166
+ if component_type not in ["worker", "driver"]:
167
+ continue
168
+
169
+ for event in component_events:
170
+ extra_data = event["extra_data"]
171
+ # Propagate extra data.
172
+ extra_data["task_id"] = task["task_id"]
173
+ extra_data["job_id"] = task["job_id"]
174
+ extra_data["attempt_number"] = task["attempt_number"]
175
+ extra_data["func_or_class_name"] = task["func_or_class_name"]
176
+ extra_data["actor_id"] = task["actor_id"]
177
+ event_name = event["event_name"]
178
+
179
+ # build a id -> arbitrary index mapping
180
+ if node_ip_address not in node_to_index:
181
+ node_to_index[node_ip_address] = node_idx
182
+ # Whenever new node ip is introduced, we increment the index.
183
+ node_idx += 1
184
+
185
+ if (
186
+ node_to_index[node_ip_address],
187
+ component_id,
188
+ ) not in worker_to_index: # noqa
189
+ worker_to_index[
190
+ (node_to_index[node_ip_address], component_id)
191
+ ] = worker_idx # noqa
192
+ worker_idx += 1
193
+
194
+ # Modify the name with the additional user-defined extra data.
195
+ cname = _default_color_mapping[event["event_name"]]
196
+ name = event_name
197
+
198
+ if "cname" in extra_data:
199
+ cname = _default_color_mapping[event["extra_data"]["cname"]]
200
+ if "name" in extra_data:
201
+ name = extra_data["name"]
202
+
203
+ new_event = ChromeTracingCompleteEvent(
204
+ cat=event_name,
205
+ name=name,
206
+ pid=node_to_index[node_ip_address],
207
+ tid=worker_to_index[(node_to_index[node_ip_address], component_id)],
208
+ ts=event["start_time"] * 1e3,
209
+ dur=(event["end_time"] * 1e3) - (event["start_time"] * 1e3),
210
+ cname=cname,
211
+ args=extra_data,
212
+ )
213
+ all_events.append(asdict(new_event))
214
+
215
+ for node, i in node_to_index.items():
216
+ all_events.append(
217
+ asdict(
218
+ ChromeTracingMetadataEvent(
219
+ name="process_name",
220
+ pid=i,
221
+ args={"name": f"Node {node}"},
222
+ )
223
+ )
224
+ )
225
+
226
+ for worker, i in worker_to_index.items():
227
+ all_events.append(
228
+ asdict(
229
+ ChromeTracingMetadataEvent(
230
+ name="thread_name",
231
+ ph="M",
232
+ tid=i,
233
+ pid=worker[0],
234
+ args={"name": worker[1]},
235
+ )
236
+ )
237
+ )
238
+
239
+ # Handle task event disabled.
240
+ return json.dumps(all_events)
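
To make the pid/tid bookkeeping in `chrome_tracing_dump` easier to follow, here is a small sketch that builds the same event shapes by hand; the node IP, component id, and timing values are fabricated for illustration, and it assumes the dataclasses above are importable from `ray._private.profiling`.

```python
# Hedged sketch: chrome tracing pid == node index, tid == worker index.
import json
from dataclasses import asdict

from ray._private.profiling import (
    ChromeTracingCompleteEvent,
    ChromeTracingMetadataEvent,
)

node_to_index = {"10.0.0.1": 0}            # node ip -> chrome "pid"
worker_to_index = {(0, "worker:abc"): 0}   # (node idx, component id) -> "tid"

events = [
    asdict(
        ChromeTracingCompleteEvent(
            cat="task:execute",
            name="my_task",
            pid=node_to_index["10.0.0.1"],
            tid=worker_to_index[(0, "worker:abc")],
            ts=1_000_000,  # start, microseconds
            dur=2_500,     # duration, microseconds
            cname="rail_animation",
            args={"task_id": "t1"},
        )
    ),
    # Metadata events give the integer pid/tid human-readable names.
    asdict(
        ChromeTracingMetadataEvent(
            name="process_name", pid=0, args={"name": "Node 10.0.0.1"}
        )
    ),
]
print(json.dumps(events))
```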
.venv/lib/python3.11/site-packages/ray/_private/prometheus_exporter.py ADDED
@@ -0,0 +1,365 @@
1
+ # NOTE: This file has been copied from OpenCensus Python exporter.
2
+ # It is because OpenCensus Prometheus exporter hasn't released for a while
3
+ # and the latest version has a compatibility issue with the latest OpenCensus
4
+ # library.
5
+
6
+ import re
7
+
8
+ from prometheus_client import start_http_server
9
+ from prometheus_client.core import (
10
+ REGISTRY,
11
+ CounterMetricFamily,
12
+ GaugeMetricFamily,
13
+ HistogramMetricFamily,
14
+ UnknownMetricFamily,
15
+ )
16
+
17
+ from opencensus.common.transports import sync
18
+ from opencensus.stats import aggregation_data as aggregation_data_module
19
+ from opencensus.stats import base_exporter
20
+ import logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class Options(object):
26
+ """Options contains options for configuring the exporter.
27
+ The address can be empty as the prometheus client will
28
+ assume it's localhost
29
+ :type namespace: str
30
+ :param namespace: The prometheus namespace to be used. Defaults to ''.
31
+ :type port: int
32
+ :param port: The Prometheus port to be used. Defaults to 8000.
33
+ :type address: str
34
+ :param address: The Prometheus address to be used. Defaults to ''.
35
+ :type registry: registry
36
+ :param registry: The Prometheus address to be used. Defaults to ''.
37
+ :type registry: :class:`~prometheus_client.core.CollectorRegistry`
38
+ :param registry: A Prometheus collector registry instance.
39
+ """
40
+
41
+ def __init__(self, namespace="", port=8000, address="", registry=REGISTRY):
42
+ self._namespace = namespace
43
+ self._registry = registry
44
+ self._port = int(port)
45
+ self._address = address
46
+
47
+ @property
48
+ def registry(self):
49
+ """Prometheus Collector Registry instance"""
50
+ return self._registry
51
+
52
+ @property
53
+ def namespace(self):
54
+ """Prefix to be used with view name"""
55
+ return self._namespace
56
+
57
+ @property
58
+ def port(self):
59
+ """Port number to listen"""
60
+ return self._port
61
+
62
+ @property
63
+ def address(self):
64
+ """Endpoint address (default is localhost)"""
65
+ return self._address
66
+
67
+
68
+ class Collector(object):
69
+ """Collector represents the Prometheus Collector object"""
70
+
71
+ def __init__(self, options=Options(), view_name_to_data_map=None):
72
+ if view_name_to_data_map is None:
73
+ view_name_to_data_map = {}
74
+ self._options = options
75
+ self._registry = options.registry
76
+ self._view_name_to_data_map = view_name_to_data_map
77
+ self._registered_views = {}
78
+
79
+ @property
80
+ def options(self):
81
+ """Options to be used to configure the exporter"""
82
+ return self._options
83
+
84
+ @property
85
+ def registry(self):
86
+ """Prometheus Collector Registry instance"""
87
+ return self._registry
88
+
89
+ @property
90
+ def view_name_to_data_map(self):
91
+ """Map with all view data objects
92
+ that will be sent to Prometheus
93
+ """
94
+ return self._view_name_to_data_map
95
+
96
+ @property
97
+ def registered_views(self):
98
+ """Map with all registered views"""
99
+ return self._registered_views
100
+
101
+ def register_view(self, view):
102
+ """register_view will create the needed structure
103
+ in order to be able to sent all data to Prometheus
104
+ """
105
+ v_name = get_view_name(self.options.namespace, view)
106
+
107
+ if v_name not in self.registered_views:
108
+ desc = {
109
+ "name": v_name,
110
+ "documentation": view.description,
111
+ "labels": list(map(sanitize, view.columns)),
112
+ "units": view.measure.unit,
113
+ }
114
+ self.registered_views[v_name] = desc
115
+
116
+ def add_view_data(self, view_data):
117
+ """Add view data object to be sent to server"""
118
+ self.register_view(view_data.view)
119
+ v_name = get_view_name(self.options.namespace, view_data.view)
120
+ self.view_name_to_data_map[v_name] = view_data
121
+
122
+ # TODO: add start and end timestamp
123
+ def to_metric(self, desc, tag_values, agg_data, metrics_map):
124
+ """to_metric translate the data that OpenCensus create
125
+ to Prometheus format, using Prometheus Metric object
126
+ :type desc: dict
127
+ :param desc: The map that describes view definition
128
+ :type tag_values: tuple of :class:
129
+ `~opencensus.tags.tag_value.TagValue`
130
+ :param object of opencensus.tags.tag_value.TagValue:
131
+ TagValue object used as label values
132
+ :type agg_data: object of :class:
133
+ `~opencensus.stats.aggregation_data.AggregationData`
134
+ :param object of opencensus.stats.aggregation_data.AggregationData:
135
+ Aggregated data that needs to be converted as Prometheus samples
136
+ :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
137
+ :class:`~prometheus_client.core.HistogramMetricFamily` or
138
+ :class:`~prometheus_client.core.UnknownMetricFamily` or
139
+ :class:`~prometheus_client.core.GaugeMetricFamily`
140
+ """
141
+ metric_name = desc["name"]
142
+ metric_description = desc["documentation"]
143
+ label_keys = desc["labels"]
144
+ metric_units = desc["units"]
145
+ assert len(tag_values) == len(label_keys), (tag_values, label_keys)
146
+ # Prometheus requires that all tag values be strings hence
147
+ # the need to cast none to the empty string before exporting. See
148
+ # https://github.com/census-instrumentation/opencensus-python/issues/480
149
+ tag_values = [tv if tv else "" for tv in tag_values]
150
+
151
+ if isinstance(agg_data, aggregation_data_module.CountAggregationData):
152
+ metric = metrics_map.get(metric_name)
153
+ if not metric:
154
+ metric = CounterMetricFamily(
155
+ name=metric_name,
156
+ documentation=metric_description,
157
+ unit=metric_units,
158
+ labels=label_keys,
159
+ )
160
+ metrics_map[metric_name] = metric
161
+ metric.add_metric(labels=tag_values, value=agg_data.count_data)
162
+ return
163
+
164
+ elif isinstance(agg_data, aggregation_data_module.DistributionAggregationData):
165
+
166
+ assert agg_data.bounds == sorted(agg_data.bounds)
167
+ # buckets are a list of buckets. Each bucket is another list with
168
+ # a pair of bucket name and value, or a triple of bucket name,
169
+ # value, and exemplar. buckets need to be in order.
170
+ buckets = []
171
+ cum_count = 0 # Prometheus buckets expect cumulative count.
172
+ for ii, bound in enumerate(agg_data.bounds):
173
+ cum_count += agg_data.counts_per_bucket[ii]
174
+ bucket = [str(bound), cum_count]
175
+ buckets.append(bucket)
176
+ # Prometheus requires buckets to be sorted, and +Inf present.
177
+ # In OpenCensus we don't have +Inf in the bucket bonds so need to
178
+ # append it here.
179
+ buckets.append(["+Inf", agg_data.count_data])
180
+ metric = metrics_map.get(metric_name)
181
+ if not metric:
182
+ metric = HistogramMetricFamily(
183
+ name=metric_name,
184
+ documentation=metric_description,
185
+ labels=label_keys,
186
+ )
187
+ metrics_map[metric_name] = metric
188
+ metric.add_metric(
189
+ labels=tag_values,
190
+ buckets=buckets,
191
+ sum_value=agg_data.sum,
192
+ )
193
+ return
194
+
195
+ elif isinstance(agg_data, aggregation_data_module.SumAggregationData):
196
+ metric = metrics_map.get(metric_name)
197
+ if not metric:
198
+ metric = UnknownMetricFamily(
199
+ name=metric_name,
200
+ documentation=metric_description,
201
+ labels=label_keys,
202
+ )
203
+ metrics_map[metric_name] = metric
204
+ metric.add_metric(labels=tag_values, value=agg_data.sum_data)
205
+ return
206
+
207
+ elif isinstance(agg_data, aggregation_data_module.LastValueAggregationData):
208
+ metric = metrics_map.get(metric_name)
209
+ if not metric:
210
+ metric = GaugeMetricFamily(
211
+ name=metric_name,
212
+ documentation=metric_description,
213
+ labels=label_keys,
214
+ )
215
+ metrics_map[metric_name] = metric
216
+ metric.add_metric(labels=tag_values, value=agg_data.value)
217
+ return
218
+
219
+ else:
220
+ raise ValueError(f"unsupported aggregation type {type(agg_data)}")
221
+
222
+ def collect(self): # pragma: NO COVER
223
+ """Collect fetches the statistics from OpenCensus
224
+ and delivers them as Prometheus Metrics.
225
+ Collect is invoked every time a prometheus.Gatherer is run
226
+ for example when the HTTP endpoint is invoked by Prometheus.
227
+ """
228
+ # Make a shallow copy of self._view_name_to_data_map, to avoid seeing
229
+ # concurrent modifications when iterating through the dictionary.
230
+ metrics_map = {}
231
+ for v_name, view_data in self._view_name_to_data_map.copy().items():
232
+ if v_name not in self.registered_views:
233
+ continue
234
+ desc = self.registered_views[v_name]
235
+ for tag_values in view_data.tag_value_aggregation_data_map:
236
+ agg_data = view_data.tag_value_aggregation_data_map[tag_values]
237
+ self.to_metric(desc, tag_values, agg_data, metrics_map)
238
+
239
+ for metric in metrics_map.values():
240
+ yield metric
241
+
242
+
243
+ class PrometheusStatsExporter(base_exporter.StatsExporter):
244
+ """Exporter exports stats to Prometheus, users need
245
+ to register the exporter as an HTTP Handler to be
246
+ able to export.
247
+ :type options:
248
+ :class:`~opencensus.ext.prometheus.stats_exporter.Options`
249
+ :param options: An options object with the parameters to instantiate the
250
+ prometheus exporter.
251
+ :type gatherer: :class:`~prometheus_client.core.CollectorRegistry`
252
+ :param gatherer: A Prometheus collector registry instance.
253
+ :type transport:
254
+ :class:`opencensus.common.transports.sync.SyncTransport` or
255
+ :class:`opencensus.common.transports.async_.AsyncTransport`
256
+ :param transport: An instance of a Transpor to send data with.
257
+ :type collector:
258
+ :class:`~opencensus.ext.prometheus.stats_exporter.Collector`
259
+ :param collector: An instance of the Prometheus Collector object.
260
+ """
261
+
262
+ def __init__(
263
+ self, options, gatherer, transport=sync.SyncTransport, collector=Collector()
264
+ ):
265
+ self._options = options
266
+ self._gatherer = gatherer
267
+ self._collector = collector
268
+ self._transport = transport(self)
269
+ self.serve_http()
270
+ REGISTRY.register(self._collector)
271
+
272
+ @property
273
+ def transport(self):
274
+ """The transport way to be sent data to server
275
+ (default is sync).
276
+ """
277
+ return self._transport
278
+
279
+ @property
280
+ def collector(self):
281
+ """Collector class instance to be used
282
+ to communicate with Prometheus
283
+ """
284
+ return self._collector
285
+
286
+ @property
287
+ def gatherer(self):
288
+ """Prometheus Collector Registry instance"""
289
+ return self._gatherer
290
+
291
+ @property
292
+ def options(self):
293
+ """Options to be used to configure the exporter"""
294
+ return self._options
295
+
296
+ def export(self, view_data):
297
+ """export send the data to the transport class
298
+ in order to be sent to Prometheus in a sync or async way.
299
+ """
300
+ if view_data is not None: # pragma: NO COVER
301
+ self.transport.export(view_data)
302
+
303
+ def on_register_view(self, view):
304
+ return NotImplementedError("Not supported by Prometheus")
305
+
306
+ def emit(self, view_data): # pragma: NO COVER
307
+ """Emit exports to the Prometheus if view data has one or more rows.
308
+ Each OpenCensus AggregationData will be converted to
309
+ corresponding Prometheus Metric: SumData will be converted
310
+ to Untyped Metric, CountData will be a Counter Metric
311
+ DistributionData will be a Histogram Metric.
312
+ """
313
+
314
+ for v_data in view_data:
315
+ if v_data.tag_value_aggregation_data_map is None:
316
+ v_data.tag_value_aggregation_data_map = {}
317
+
318
+ self.collector.add_view_data(v_data)
319
+
320
+ def serve_http(self):
321
+ """serve_http serves the Prometheus endpoint."""
322
+ address = str(self.options.address)
323
+ kwargs = {"addr": address} if address else {}
324
+ start_http_server(port=self.options.port, **kwargs)
325
+
326
+
327
+ def new_stats_exporter(option):
328
+ """new_stats_exporter returns an exporter
329
+ that exports stats to Prometheus.
330
+ """
331
+ if option.namespace == "":
332
+ raise ValueError("Namespace can not be empty string.")
333
+
334
+ collector = new_collector(option)
335
+
336
+ exporter = PrometheusStatsExporter(
337
+ options=option, gatherer=option.registry, collector=collector
338
+ )
339
+ return exporter
340
+
341
+
342
+ def new_collector(options):
343
+ """new_collector should be used
344
+ to create instance of Collector class in order to
345
+ prevent the usage of constructor directly
346
+ """
347
+ return Collector(options=options)
348
+
349
+
350
+ def get_view_name(namespace, view):
351
+ """create the name for the view"""
352
+ name = ""
353
+ if namespace != "":
354
+ name = namespace + "_"
355
+ return sanitize(name + view.name)
356
+
357
+
358
+ _NON_LETTERS_NOR_DIGITS_RE = re.compile(r"[^\w]", re.UNICODE | re.IGNORECASE)
359
+
360
+
361
+ def sanitize(key):
362
+ """sanitize the given metric name or label according to Prometheus rule.
363
+ Replace all characters other than [A-Za-z0-9_] with '_'.
364
+ """
365
+ return _NON_LETTERS_NOR_DIGITS_RE.sub("_", key)
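
For context, a plausible way to stand up this exporter looks like the sketch below; the namespace, port, and address are arbitrary example values, it assumes `opencensus` and `prometheus_client` are installed, and wiring it into an OpenCensus view manager is only hinted at in the comment.

```python
# Hedged sketch: start a /metrics endpoint backed by the Collector above.
from ray._private.prometheus_exporter import Options, new_stats_exporter

exporter = new_stats_exporter(
    Options(namespace="ray_demo", port=9090, address="127.0.0.1")
)
# serve_http() has now bound 127.0.0.1:9090; feeding data in still requires
# registering the exporter with an OpenCensus view manager, e.g. something
# like view_manager.register_exporter(exporter) in the stats pipeline.
```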
.venv/lib/python3.11/site-packages/ray/_private/protobuf_compat.py ADDED
@@ -0,0 +1,46 @@
+ from google.protobuf.json_format import MessageToDict
+ import inspect
+
+ """
+ This module provides a compatibility layer for different versions of the protobuf
+ library.
+ """
+
+ _protobuf_has_old_arg_name_cached = None
+
+
+ def _protobuf_has_old_arg_name():
+     """Cache the inspect result to avoid doing it for every single message."""
+     global _protobuf_has_old_arg_name_cached
+     if _protobuf_has_old_arg_name_cached is None:
+         params = inspect.signature(MessageToDict).parameters
+         _protobuf_has_old_arg_name_cached = "including_default_value_fields" in params
+     return _protobuf_has_old_arg_name_cached
+
+
+ def rename_always_print_fields_with_no_presence(kwargs):
+     """
+     Protobuf version 5.26.0rc2 renamed argument for `MessageToDict`:
+     `including_default_value_fields` -> `always_print_fields_with_no_presence`.
+     See https://github.com/protocolbuffers/protobuf/commit/06e7caba58ede0220b110b89d08f329e5f8a7537#diff-8de817c14d6a087981503c9aea38730b1b3e98f4e306db5ff9d525c7c304f234L129 # noqa: E501
+
+     We choose to always use the new argument name. If user used the old arg, we raise an
+     error.
+
+     If protobuf does not have the new arg name but have the old arg name, we rename our
+     arg to the old one.
+     """
+     old_arg_name = "including_default_value_fields"
+     new_arg_name = "always_print_fields_with_no_presence"
+     if old_arg_name in kwargs:
+         raise ValueError(f"{old_arg_name} is deprecated, please use {new_arg_name}")
+
+     if new_arg_name in kwargs and _protobuf_has_old_arg_name():
+         kwargs[old_arg_name] = kwargs.pop(new_arg_name)
+
+     return kwargs
+
+
+ def message_to_dict(*args, **kwargs):
+     kwargs = rename_always_print_fields_with_no_presence(kwargs)
+     return MessageToDict(*args, **kwargs)
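
A short usage sketch of the shim above: the caller always passes the new keyword and lets `message_to_dict` rename it when an older protobuf is installed. The `FileDescriptorProto` message is just a convenient stand-in available in any protobuf install.

```python
# Hedged sketch: one call site that works across protobuf generations.
from google.protobuf.descriptor_pb2 import FileDescriptorProto

from ray._private.protobuf_compat import message_to_dict

msg = FileDescriptorProto(name="example.proto")
# Passing the old name would raise ValueError; the new name is rewritten to
# the old one only when the installed MessageToDict still expects it.
print(message_to_dict(msg, always_print_fields_with_no_presence=True))
```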
.venv/lib/python3.11/site-packages/ray/_private/pydantic_compat.py ADDED
@@ -0,0 +1,108 @@
+ # ruff: noqa
+ import packaging.version
+
+ # Pydantic is a dependency of `ray["default"]` but not the minimal installation,
+ # so handle the case where it isn't installed.
+ try:
+     import pydantic
+
+     PYDANTIC_INSTALLED = True
+ except ImportError:
+     pydantic = None
+     PYDANTIC_INSTALLED = False
+
+
+ if not PYDANTIC_INSTALLED:
+     IS_PYDANTIC_2 = False
+     BaseModel = None
+     Extra = None
+     Field = None
+     NonNegativeFloat = None
+     NonNegativeInt = None
+     PositiveFloat = None
+     PositiveInt = None
+     PrivateAttr = None
+     StrictInt = None
+     ValidationError = None
+     root_validator = None
+     validator = None
+     is_subclass_of_base_model = lambda obj: False
+ # In pydantic <1.9.0, __version__ attribute is missing, issue ref:
+ # https://github.com/pydantic/pydantic/issues/2572, so we need to check
+ # the existence prior to comparison.
+ elif not hasattr(pydantic, "__version__") or packaging.version.parse(
+     pydantic.__version__
+ ) < packaging.version.parse("2.0"):
+     IS_PYDANTIC_2 = False
+     from pydantic import (
+         BaseModel,
+         Extra,
+         Field,
+         NonNegativeFloat,
+         NonNegativeInt,
+         PositiveFloat,
+         PositiveInt,
+         PrivateAttr,
+         StrictInt,
+         ValidationError,
+         root_validator,
+         validator,
+     )
+
+     def is_subclass_of_base_model(obj):
+         return issubclass(obj, BaseModel)
+
+ else:
+     IS_PYDANTIC_2 = True
+     from pydantic.v1 import (
+         BaseModel,
+         Extra,
+         Field,
+         NonNegativeFloat,
+         NonNegativeInt,
+         PositiveFloat,
+         PositiveInt,
+         PrivateAttr,
+         StrictInt,
+         ValidationError,
+         root_validator,
+         validator,
+     )
+
+     def is_subclass_of_base_model(obj):
+         from pydantic import BaseModel as BaseModelV2
+         from pydantic.v1 import BaseModel as BaseModelV1
+
+         return issubclass(obj, BaseModelV1) or issubclass(obj, BaseModelV2)
+
+
+ def register_pydantic_serializers(serialization_context):
+     if not PYDANTIC_INSTALLED:
+         return
+
+     if IS_PYDANTIC_2:
+         # TODO(edoakes): compare against the version that has the fixes.
+         from pydantic.v1.fields import ModelField
+     else:
+         from pydantic.fields import ModelField
+
+     # Pydantic's Cython validators are not serializable.
+     # https://github.com/cloudpipe/cloudpickle/issues/408
+     serialization_context._register_cloudpickle_serializer(
+         ModelField,
+         custom_serializer=lambda o: {
+             "name": o.name,
+             # outer_type_ is the original type for ModelFields,
+             # while type_ can be updated later with the nested type
+             # like int for List[int].
+             "type_": o.outer_type_,
+             "class_validators": o.class_validators,
+             "model_config": o.model_config,
+             "default": o.default,
+             "default_factory": o.default_factory,
+             "required": o.required,
+             "alias": o.alias,
+             "field_info": o.field_info,
+         },
+         custom_deserializer=lambda kwargs: ModelField(**kwargs),
+     )
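
The point of this shim is that callers import v1-style symbols from one place regardless of which pydantic major version is present. The sketch below defines a throwaway model through it; `JobSpec` is a hypothetical name used only for illustration.

```python
# Hedged sketch: same model code under pydantic 1.x or 2.x (v1 compat layer).
from ray._private.pydantic_compat import (
    PYDANTIC_INSTALLED,
    BaseModel,
    PositiveInt,
    validator,
)

if PYDANTIC_INSTALLED:

    class JobSpec(BaseModel):  # hypothetical model, for illustration only
        name: str
        num_workers: PositiveInt = 1

        @validator("name")
        def name_not_blank(cls, v):
            if not v.strip():
                raise ValueError("name must not be blank")
            return v

    print(JobSpec(name="demo", num_workers=2))
```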
.venv/lib/python3.11/site-packages/ray/_private/ray_client_microbenchmark.py ADDED
@@ -0,0 +1,117 @@
1
+ import inspect
2
+ import logging
3
+ import numpy as np
4
+ import sys
5
+
6
+ from ray.util.client.ray_client_helpers import ray_start_client_server
7
+
8
+ from ray._private.ray_microbenchmark_helpers import timeit
9
+
10
+
11
+ def benchmark_get_calls(ray, results):
12
+ value = ray.put(0)
13
+
14
+ def get_small():
15
+ ray.get(value)
16
+
17
+ results += timeit("client: get calls", get_small)
18
+
19
+
20
+ def benchmark_tasks_and_get_batch(ray, results):
21
+ @ray.remote
22
+ def small_value():
23
+ return b"ok"
24
+
25
+ def small_value_batch():
26
+ submitted = [small_value.remote() for _ in range(1000)]
27
+ ray.get(submitted)
28
+ return 0
29
+
30
+ results += timeit("client: tasks and get batch", small_value_batch)
31
+
32
+
33
+ def benchmark_put_calls(ray, results):
34
+ def put_small():
35
+ ray.put(0)
36
+
37
+ results += timeit("client: put calls", put_small)
38
+
39
+
40
+ def benchmark_remote_put_calls(ray, results):
41
+ @ray.remote
42
+ def do_put_small():
43
+ for _ in range(100):
44
+ ray.put(0)
45
+
46
+ def put_multi_small():
47
+ ray.get([do_put_small.remote() for _ in range(10)])
48
+
49
+ results += timeit("client: tasks and put batch", put_multi_small, 1000)
50
+
51
+
52
+ def benchmark_put_large(ray, results):
53
+ arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
54
+
55
+ def put_large():
56
+ ray.put(arr)
57
+
58
+ results += timeit("client: put gigabytes", put_large, 8 * 0.1)
59
+
60
+
61
+ def benchmark_simple_actor(ray, results):
62
+ @ray.remote(num_cpus=0)
63
+ class Actor:
64
+ def small_value(self):
65
+ return b"ok"
66
+
67
+ def small_value_arg(self, x):
68
+ return b"ok"
69
+
70
+ def small_value_batch(self, n):
71
+ ray.get([self.small_value.remote() for _ in range(n)])
72
+
73
+ a = Actor.remote()
74
+
75
+ def actor_sync():
76
+ ray.get(a.small_value.remote())
77
+
78
+ results += timeit("client: 1:1 actor calls sync", actor_sync)
79
+
80
+ def actor_async():
81
+ ray.get([a.small_value.remote() for _ in range(1000)])
82
+
83
+ results += timeit("client: 1:1 actor calls async", actor_async, 1000)
84
+
85
+ a = Actor.options(max_concurrency=16).remote()
86
+
87
+ def actor_concurrent():
88
+ ray.get([a.small_value.remote() for _ in range(1000)])
89
+
90
+ results += timeit("client: 1:1 actor calls concurrent", actor_concurrent, 1000)
91
+
92
+
93
+ def main(results=None):
94
+ results = results or []
95
+
96
+ ray_config = {"logging_level": logging.WARNING}
97
+
98
+ def ray_connect_handler(job_config=None, **ray_init_kwargs):
99
+ from ray._private.client_mode_hook import disable_client_hook
100
+
101
+ with disable_client_hook():
102
+ import ray as real_ray
103
+
104
+ if not real_ray.is_initialized():
105
+ real_ray.init(**ray_config)
106
+
107
+ for name, obj in inspect.getmembers(sys.modules[__name__]):
108
+ if not name.startswith("benchmark_"):
109
+ continue
110
+ with ray_start_client_server(ray_connect_handler=ray_connect_handler) as ray:
111
+ obj(ray, results)
112
+
113
+ return results
114
+
115
+
116
+ if __name__ == "__main__":
117
+ main()
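
The `main()` above discovers benchmarks purely by name prefix via `inspect.getmembers`. The standalone toy below shows that discovery pattern on its own, without Ray or the client server; the benchmark functions here are fabricated stand-ins.

```python
# Hedged sketch of prefix-based benchmark discovery, no Ray required.
import inspect
import sys


def benchmark_addition(results):
    results.append(("addition", 1 + 1))


def benchmark_concat(results):
    results.append(("concat", "a" + "b"))


def run_all():
    results = []
    for name, obj in inspect.getmembers(sys.modules[__name__]):
        if name.startswith("benchmark_") and callable(obj):
            obj(results)
    return results


print(run_all())
```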
.venv/lib/python3.11/site-packages/ray/_private/ray_cluster_perf.py ADDED
@@ -0,0 +1,50 @@
+ """This is the script for `ray clusterbenchmark`."""
+
+ import time
+ import numpy as np
+ import ray
+
+ from ray.cluster_utils import Cluster
+
+
+ def main():
+     cluster = Cluster(
+         initialize_head=True,
+         connect=True,
+         head_node_args={"object_store_memory": 20 * 1024 * 1024 * 1024, "num_cpus": 16},
+     )
+     cluster.add_node(
+         object_store_memory=20 * 1024 * 1024 * 1024, num_gpus=1, num_cpus=16
+     )
+
+     object_ref_list = []
+     for i in range(0, 10):
+         object_ref = ray.put(np.random.rand(1024 * 128, 1024))
+         object_ref_list.append(object_ref)
+
+     @ray.remote(num_gpus=1)
+     def f(object_ref_list):
+         diffs = []
+         for object_ref in object_ref_list:
+             before = time.time()
+             ray.get(object_ref)
+             after = time.time()
+             diffs.append(after - before)
+             time.sleep(1)
+         return np.mean(diffs), np.std(diffs)
+
+     time_diff, time_diff_std = ray.get(f.remote(object_ref_list))
+
+     print(
+         "latency to get an 1G object over network",
+         round(time_diff, 2),
+         "+-",
+         round(time_diff_std, 2),
+     )
+
+     ray.shutdown()
+     cluster.shutdown()
+
+
+ if __name__ == "__main__":
+     main()
.venv/lib/python3.11/site-packages/ray/_private/ray_constants.py ADDED
@@ -0,0 +1,554 @@
1
+ """Ray constants used in the Python code."""
2
+
3
+ import logging
4
+ import os
5
+ import sys
6
+ import json
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def env_integer(key, default):
12
+ if key in os.environ:
13
+ value = os.environ[key]
14
+ if value.isdigit():
15
+ return int(os.environ[key])
16
+
17
+ logger.debug(
18
+ f"Found {key} in environment, but value must "
19
+ f"be an integer. Got: {value}. Returning "
20
+ f"provided default {default}."
21
+ )
22
+ return default
23
+ return default
24
+
25
+
26
+ def env_float(key, default):
27
+ if key in os.environ:
28
+ value = os.environ[key]
29
+ try:
30
+ return float(value)
31
+ except ValueError:
32
+ logger.debug(
33
+ f"Found {key} in environment, but value must "
34
+ f"be a float. Got: {value}. Returning "
35
+ f"provided default {default}."
36
+ )
37
+ return default
38
+ return default
39
+
40
+
41
+ def env_bool(key, default):
42
+ if key in os.environ:
43
+ return (
44
+ True
45
+ if os.environ[key].lower() == "true" or os.environ[key] == "1"
46
+ else False
47
+ )
48
+ return default
49
+
50
+
51
+ def env_set_by_user(key):
52
+ return key in os.environ
53
+
54
+
55
+ # Whether event logging to driver is enabled. Set to 0 to disable.
56
+ AUTOSCALER_EVENTS = env_integer("RAY_SCHEDULER_EVENTS", 1)
57
+
58
+ RAY_LOG_TO_DRIVER = env_bool("RAY_LOG_TO_DRIVER", True)
59
+
60
+ # Filter level under which events will be filtered out, i.e. not printing to driver
61
+ RAY_LOG_TO_DRIVER_EVENT_LEVEL = os.environ.get("RAY_LOG_TO_DRIVER_EVENT_LEVEL", "INFO")
62
+
63
+ # Internal kv keys for storing monitor debug status.
64
+ DEBUG_AUTOSCALING_ERROR = "__autoscaling_error"
65
+ DEBUG_AUTOSCALING_STATUS = "__autoscaling_status"
66
+ DEBUG_AUTOSCALING_STATUS_LEGACY = "__autoscaling_status_legacy"
67
+
68
+ ID_SIZE = 28
69
+
70
+ # The default maximum number of bytes to allocate to the object store unless
71
+ # overridden by the user.
72
+ DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = env_integer(
73
+ "RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES", 200 * 10**9 # 200 GB
74
+ )
75
+ # The default proportion of available memory allocated to the object store
76
+ DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = env_float(
77
+ "RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION", 0.3
78
+ )
79
+ # The smallest cap on the memory used by the object store that we allow.
80
+ # This must be greater than MEMORY_RESOURCE_UNIT_BYTES
81
+ OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
82
+ # Each ObjectRef currently uses about 3KB of caller memory.
83
+ CALLER_MEMORY_USAGE_PER_OBJECT_REF = 3000
84
+ # Match max_direct_call_object_size in
85
+ # src/ray/common/ray_config_def.h.
86
+ # TODO(swang): Ideally this should be pulled directly from the
87
+ # config in case the user overrides it.
88
+ DEFAULT_MAX_DIRECT_CALL_OBJECT_SIZE = 100 * 1024
89
+ # The default maximum number of bytes that the non-primary Redis shards are
90
+ # allowed to use unless overridden by the user.
91
+ DEFAULT_REDIS_MAX_MEMORY_BYTES = 10**10
92
+ # The smallest cap on the memory used by Redis that we allow.
93
+ REDIS_MINIMUM_MEMORY_BYTES = 10**7
94
+ # Above this number of bytes, raise an error by default unless the user sets
95
+ # RAY_ALLOW_SLOW_STORAGE=1. This avoids swapping with large object stores.
96
+ REQUIRE_SHM_SIZE_THRESHOLD = 10**10
97
+ # Mac with 16GB memory has degraded performance when the object store size is
98
+ # greater than 2GB.
99
+ # (see https://github.com/ray-project/ray/issues/20388 for details)
100
+ # The workaround here is to limit capacity to 2GB for Mac by default,
101
+ # and raise an error if the capacity is overridden by the user.
102
+ MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT = 2 * 2**30
103
+ # If a user does not specify a port for the primary Ray service,
104
+ # we attempt to start the service running at this port.
105
+ DEFAULT_PORT = 6379
106
+
107
+ RAY_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_ADDRESS"
108
+ RAY_NAMESPACE_ENVIRONMENT_VARIABLE = "RAY_NAMESPACE"
109
+ RAY_RUNTIME_ENV_ENVIRONMENT_VARIABLE = "RAY_RUNTIME_ENV"
110
+ RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR = (
111
+ "RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S"
112
+ )
113
+ # Ray populates this env var to the working dir in the creation of a runtime env.
114
+ # For example, `pip` and `conda` users can use this environment variable to locate the
115
+ # `requirements.txt` file.
116
+ RAY_RUNTIME_ENV_CREATE_WORKING_DIR_ENV_VAR = "RAY_RUNTIME_ENV_CREATE_WORKING_DIR"
117
+ # Defaults to 10 minutes. This should be longer than the total time it takes for
118
+ # the local working_dir and py_modules to be uploaded, or these files might get
119
+ # garbage collected before the job starts.
120
+ RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT = 10 * 60
121
+ # If set to 1, then `.gitignore` files will not be parsed and loaded into "excludes"
122
+ # when using a local working_dir or py_modules.
123
+ RAY_RUNTIME_ENV_IGNORE_GITIGNORE = "RAY_RUNTIME_ENV_IGNORE_GITIGNORE"
124
+ RAY_STORAGE_ENVIRONMENT_VARIABLE = "RAY_STORAGE"
125
+ # Hook for running a user-specified runtime-env hook. This hook will be called
126
+ # unconditionally given the runtime_env dict passed for ray.init. It must return
127
+ # a rewritten runtime_env dict. Example: "your.module.runtime_env_hook".
128
+ RAY_RUNTIME_ENV_HOOK = "RAY_RUNTIME_ENV_HOOK"
129
+ # Hook that is invoked on `ray start`. It will be given the cluster parameters and
130
+ # whether we are the head node as arguments. The function can modify the params class,
131
+ # but otherwise returns void. Example: "your.module.ray_start_hook".
132
+ RAY_START_HOOK = "RAY_START_HOOK"
133
+ # Hook that is invoked on `ray job submit`. It will be given all the same args as the
134
+ # job.cli.submit() function gets, passed as kwargs to this function.
135
+ RAY_JOB_SUBMIT_HOOK = "RAY_JOB_SUBMIT_HOOK"
136
+ # Headers to pass when using the Job CLI. They will be used to
137
+ # instantiate a Job SubmissionClient.
138
+ RAY_JOB_HEADERS = "RAY_JOB_HEADERS"
139
+
140
+ DEFAULT_DASHBOARD_IP = "127.0.0.1"
141
+ DEFAULT_DASHBOARD_PORT = 8265
142
+ DASHBOARD_ADDRESS = "dashboard"
143
+ PROMETHEUS_SERVICE_DISCOVERY_FILE = "prom_metrics_service_discovery.json"
144
+ DEFAULT_DASHBOARD_AGENT_LISTEN_PORT = 52365
145
+ # Default resource requirements for actors when no resource requirements are
146
+ # specified.
147
+ DEFAULT_ACTOR_METHOD_CPU_SIMPLE = 1
148
+ DEFAULT_ACTOR_CREATION_CPU_SIMPLE = 0
149
+ # Default resource requirements for actors when some resource requirements are
150
+ # specified.
151
+ DEFAULT_ACTOR_METHOD_CPU_SPECIFIED = 0
152
+ DEFAULT_ACTOR_CREATION_CPU_SPECIFIED = 1
153
+ # Default number of return values for each actor method.
154
+ DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS = 1
155
+
156
+ # Wait 30 seconds for client to reconnect after unexpected disconnection
157
+ DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD = 30
158
+
159
+ # If a remote function or actor (or some other export) has serialized size
160
+ # greater than this quantity, print a warning.
161
+ FUNCTION_SIZE_WARN_THRESHOLD = 10**7
162
+ FUNCTION_SIZE_ERROR_THRESHOLD = env_integer("FUNCTION_SIZE_ERROR_THRESHOLD", (10**8))
163
+
164
+ # If remote functions with the same source are imported this many times, then
165
+ # print a warning.
166
+ DUPLICATE_REMOTE_FUNCTION_THRESHOLD = 100
167
+
168
+ # The maximum resource quantity that is allowed. TODO(rkn): This could be
169
+ # relaxed, but the current implementation of the node manager will be slower
170
+ # for large resource quantities due to bookkeeping of specific resource IDs.
171
+ MAX_RESOURCE_QUANTITY = 100e12
172
+
173
+ # Number of units 1 resource can be subdivided into.
174
+ MIN_RESOURCE_GRANULARITY = 0.0001
175
+
176
+ # Set this environment variable to populate the dashboard URL with
177
+ # an externally hosted Ray dashboard URL (e.g. because the
178
+ # dashboard is behind a proxy or load balancer). This only overrides
179
+ # the dashboard URL when returning or printing to a user through a public
180
+ # API, but not in the internal KV store.
181
+ RAY_OVERRIDE_DASHBOARD_URL = "RAY_OVERRIDE_DASHBOARD_URL"
182
+
183
+
184
+ # Different types of Ray errors that can be pushed to the driver.
185
+ # TODO(rkn): These should be defined in flatbuffers and must be synced with
186
+ # the existing C++ definitions.
187
+ PICKLING_LARGE_OBJECT_PUSH_ERROR = "pickling_large_object"
188
+ WAIT_FOR_FUNCTION_PUSH_ERROR = "wait_for_function"
189
+ VERSION_MISMATCH_PUSH_ERROR = "version_mismatch"
190
+ WORKER_CRASH_PUSH_ERROR = "worker_crash"
191
+ WORKER_DIED_PUSH_ERROR = "worker_died"
192
+ WORKER_POOL_LARGE_ERROR = "worker_pool_large"
193
+ PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
194
+ RESOURCE_DEADLOCK_ERROR = "resource_deadlock"
195
+ REMOVED_NODE_ERROR = "node_removed"
196
+ MONITOR_DIED_ERROR = "monitor_died"
197
+ LOG_MONITOR_DIED_ERROR = "log_monitor_died"
198
+ DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
199
+ DASHBOARD_DIED_ERROR = "dashboard_died"
200
+ RAYLET_DIED_ERROR = "raylet_died"
201
+ DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR = "detached_actor_anonymous_namespace"
202
+ EXCESS_QUEUEING_WARNING = "excess_queueing_warning"
203
+
204
+ # Used in gpu detection
205
+ RESOURCE_CONSTRAINT_PREFIX = "accelerator_type:"
206
+
207
+ # Used by autoscaler to set the node custom resources and labels
208
+ # from cluster.yaml.
209
+ RESOURCES_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_RESOURCES"
210
+ LABELS_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_LABELS"
211
+
212
+ # Temporary flag to disable log processing in the dashboard. This is useful
213
+ # if the dashboard is overloaded by logs and failing to process other
214
+ # dashboard API requests (e.g. Job Submission).
215
+ DISABLE_DASHBOARD_LOG_INFO = env_integer("RAY_DISABLE_DASHBOARD_LOG_INFO", 0)
216
+
217
+ LOGGER_FORMAT = "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
218
+ LOGGER_FORMAT_ESCAPE = json.dumps(LOGGER_FORMAT.replace("%", "%%"))
219
+ LOGGER_FORMAT_HELP = f"The logging format. default={LOGGER_FORMAT_ESCAPE}"
220
+ # Configure the default logging levels for various Ray components.
221
+ # TODO (kevin85421): Currently, I don't encourage Ray users to configure
222
+ # `RAY_LOGGER_LEVEL` until its scope and expected behavior are clear and
223
+ # easy to understand. Now, only Ray developers should use it.
224
+ LOGGER_LEVEL = os.environ.get("RAY_LOGGER_LEVEL", "info")
225
+ LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
226
+ LOGGER_LEVEL_HELP = (
227
+ "The logging level threshold, choices=['debug', 'info',"
228
+ " 'warning', 'error', 'critical'], default='info'"
229
+ )
230
+
231
+ LOGGING_ROTATE_BYTES = 512 * 1024 * 1024 # 512MB.
232
+ LOGGING_ROTATE_BACKUP_COUNT = 5 # 5 Backup files at max.
233
+
234
+ LOGGING_REDIRECT_STDERR_ENVIRONMENT_VARIABLE = "RAY_LOG_TO_STDERR"
235
+ # Logging format when logging stderr. This should be formatted with the
236
+ # component before setting the formatter, e.g. via
237
+ # format = LOGGER_FORMAT_STDERR.format(component="dashboard")
238
+ # handler.setFormatter(logging.Formatter(format))
239
+ LOGGER_FORMAT_STDERR = (
240
+ "%(asctime)s\t%(levelname)s ({component}) %(filename)s:%(lineno)s -- %(message)s"
241
+ )
242
+
243
+ # Constants used to define the different process types.
244
+ PROCESS_TYPE_REAPER = "reaper"
245
+ PROCESS_TYPE_MONITOR = "monitor"
246
+ PROCESS_TYPE_RAY_CLIENT_SERVER = "ray_client_server"
247
+ PROCESS_TYPE_LOG_MONITOR = "log_monitor"
248
+ # TODO(sang): Delete it.
249
+ PROCESS_TYPE_REPORTER = "reporter"
250
+ PROCESS_TYPE_DASHBOARD = "dashboard"
251
+ PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent"
252
+ PROCESS_TYPE_RUNTIME_ENV_AGENT = "runtime_env_agent"
253
+ PROCESS_TYPE_WORKER = "worker"
254
+ PROCESS_TYPE_RAYLET = "raylet"
255
+ PROCESS_TYPE_REDIS_SERVER = "redis_server"
256
+ PROCESS_TYPE_WEB_UI = "web_ui"
257
+ PROCESS_TYPE_GCS_SERVER = "gcs_server"
258
+ PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver"
259
+ PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker"
260
+
261
+ # Log file names
262
+ MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log"
263
+ LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log"
264
+
265
+ # Enable log deduplication.
266
+ RAY_DEDUP_LOGS = env_bool("RAY_DEDUP_LOGS", True)
267
+
268
+ # How many seconds of messages to buffer for log deduplication.
269
+ RAY_DEDUP_LOGS_AGG_WINDOW_S = env_integer("RAY_DEDUP_LOGS_AGG_WINDOW_S", 5)
270
+
271
+ # Regex for log messages to never deduplicate, or None. This takes precedence over
272
+ # the skip regex below. A default pattern is set for testing.
273
+ TESTING_NEVER_DEDUP_TOKEN = "__ray_testing_never_deduplicate__"
274
+ RAY_DEDUP_LOGS_ALLOW_REGEX = os.environ.get(
275
+ "RAY_DEDUP_LOGS_ALLOW_REGEX", TESTING_NEVER_DEDUP_TOKEN
276
+ )
277
+
278
+ # Regex for log messages to always skip / suppress, or None.
279
+ RAY_DEDUP_LOGS_SKIP_REGEX = os.environ.get("RAY_DEDUP_LOGS_SKIP_REGEX")
280
+
281
+ WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE"
282
+ WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker"
283
+ WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker"
284
+ WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE = (
285
+ f"ray::IDLE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
286
+ )
287
+ WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE = (
288
+ f"ray::IDLE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
289
+ )
290
+ WORKER_PROCESS_TYPE_SPILL_WORKER = f"ray::SPILL_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
291
+ WORKER_PROCESS_TYPE_RESTORE_WORKER = (
292
+ f"ray::RESTORE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
293
+ )
294
+ WORKER_PROCESS_TYPE_SPILL_WORKER_DELETE = (
295
+ f"ray::DELETE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
296
+ )
297
+ WORKER_PROCESS_TYPE_RESTORE_WORKER_DELETE = (
298
+ f"ray::DELETE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
299
+ )
300
+
301
+ # The number of files the log monitor will open. If more files exist, they will
302
+ # be ignored.
303
+ LOG_MONITOR_MAX_OPEN_FILES = int(
304
+ os.environ.get("RAY_LOG_MONITOR_MAX_OPEN_FILES", "200")
305
+ )
306
+
307
+ # The maximum batch of lines to be read in a single iteration. We _always_ try
308
+ # to read this number of lines even if there aren't any new lines.
309
+ LOG_MONITOR_NUM_LINES_TO_READ = int(
310
+ os.environ.get("RAY_LOG_MONITOR_NUM_LINES_TO_READ", "1000")
311
+ )
312
+
313
+ # Autoscaler events are denoted by the ":event_summary:" magic token.
314
+ LOG_PREFIX_EVENT_SUMMARY = ":event_summary:"
315
+ # Cluster-level info events are denoted by the ":info_message:" magic token. These may
316
+ # be emitted in the stderr of Ray components.
317
+ LOG_PREFIX_INFO_MESSAGE = ":info_message:"
318
+ # Actor names are recorded in the logs with this magic token as a prefix.
319
+ LOG_PREFIX_ACTOR_NAME = ":actor_name:"
320
+ # Task names are recorded in the logs with this magic token as a prefix.
321
+ LOG_PREFIX_TASK_NAME = ":task_name:"
322
+ # Job ids are recorded in the logs with this magic token as a prefix.
323
+ LOG_PREFIX_JOB_ID = ":job_id:"
324
+
325
+ # The object metadata field uses the following format: It is a comma
326
+ # separated list of fields. The first field is mandatory and is the
327
+ # type of the object (see types below) or an integer, which is interpreted
328
+ # as an error value. The second part is optional and if present has the
329
+ # form DEBUG:<breakpoint_id>; it is used for implementing the debugger.
330
+
331
+ # A constant used as object metadata to indicate the object is cross language.
332
+ OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
333
+ # A constant used as object metadata to indicate the object is python specific.
334
+ OBJECT_METADATA_TYPE_PYTHON = b"PYTHON"
335
+ # A constant used as object metadata to indicate the object is raw bytes.
336
+ OBJECT_METADATA_TYPE_RAW = b"RAW"
337
+
338
+ # A constant used as object metadata to indicate the object is an actor handle.
339
+ # This value should be synchronized with the Java definition in
340
+ # ObjectSerializer.java
341
+ # TODO(fyrestone): Serialize the ActorHandle via the custom type feature
342
+ # of XLANG.
343
+ OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"
344
+
345
+ # A constant indicating the debugging part of the metadata (see above).
346
+ OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"
347
+
348
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"
349
+
350
+ REDIS_DEFAULT_USERNAME = ""
351
+
352
+ REDIS_DEFAULT_PASSWORD = ""
353
+
354
+ # The default ip address to bind to.
355
+ NODE_DEFAULT_IP = "127.0.0.1"
356
+
357
+ # The Mach kernel page size in bytes.
358
+ MACH_PAGE_SIZE_BYTES = 4096
359
+
360
+ # The max number of bytes for task execution error message.
361
+ MAX_APPLICATION_ERROR_LEN = 500
362
+
363
+ # Max 64 bit integer value, which is needed to ensure against overflow
364
+ # in C++ when passing integer values cross-language.
365
+ MAX_INT64_VALUE = 9223372036854775807
366
+
367
+ # Object Spilling related constants
368
+ DEFAULT_OBJECT_PREFIX = "ray_spilled_objects"
369
+
370
+ GCS_PORT_ENVIRONMENT_VARIABLE = "RAY_GCS_SERVER_PORT"
371
+
372
+ HEALTHCHECK_EXPIRATION_S = os.environ.get("RAY_HEALTHCHECK_EXPIRATION_S", 10)
373
+
374
+ # Filename of "shim process" that sets up Python worker environment.
375
+ # Should be kept in sync with kSetupWorkerFilename in
376
+ # src/ray/common/constants.h.
377
+ SETUP_WORKER_FILENAME = "setup_worker.py"
378
+
379
+ # Directory name where runtime_env resources will be created & cached.
380
+ DEFAULT_RUNTIME_ENV_DIR_NAME = "runtime_resources"
381
+
382
+ # The timeout in seconds for the creation of the runtime env;
383
+ # the default timeout is 10 minutes.
384
+ DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS = 600
385
+
386
+ # Used to separate lines when formatting the call stack where an ObjectRef was
387
+ # created.
388
+ CALL_STACK_LINE_DELIMITER = " | "
389
+
390
+ # The default gRPC max message size is 4 MiB; we use a larger limit of 250 MiB.
391
+ # NOTE: This is equal to the C++ limit of (RAY_CONFIG::max_grpc_message_size)
392
+ GRPC_CPP_MAX_MESSAGE_SIZE = 250 * 1024 * 1024
393
+
394
+ # The gRPC send & receive max length for "dashboard agent" server.
395
+ # NOTE: This is equal to the C++ limit of RayConfig::max_grpc_message_size
396
+ # and HAVE TO STAY IN SYNC with it (ie, meaning that both of these values
397
+ # have to be set at the same time)
398
+ AGENT_GRPC_MAX_MESSAGE_LENGTH = env_integer(
399
+ "AGENT_GRPC_MAX_MESSAGE_LENGTH", 20 * 1024 * 1024 # 20MB
400
+ )
401
+
402
+
403
+ # GRPC options
404
+ GRPC_ENABLE_HTTP_PROXY = (
405
+ 1
406
+ if os.environ.get("RAY_grpc_enable_http_proxy", "0").lower() in ("1", "true")
407
+ else 0
408
+ )
409
+ GLOBAL_GRPC_OPTIONS = (("grpc.enable_http_proxy", GRPC_ENABLE_HTTP_PROXY),)
410
+
411
+ # Internal kv namespaces
412
+ KV_NAMESPACE_DASHBOARD = b"dashboard"
413
+ KV_NAMESPACE_SESSION = b"session"
414
+ KV_NAMESPACE_TRACING = b"tracing"
415
+ KV_NAMESPACE_PDB = b"ray_pdb"
416
+ KV_NAMESPACE_HEALTHCHECK = b"healthcheck"
417
+ KV_NAMESPACE_JOB = b"job"
418
+ KV_NAMESPACE_CLUSTER = b"cluster"
419
+ KV_HEAD_NODE_ID_KEY = b"head_node_id"
420
+ # TODO: Set package for runtime env
421
+ # We need to update ray client for this since runtime env uses ray client.
422
+ # This might introduce some compatibility issues so leave it here for now.
423
+ KV_NAMESPACE_PACKAGE = None
424
+ KV_NAMESPACE_SERVE = b"serve"
425
+ KV_NAMESPACE_FUNCTION_TABLE = b"fun"
426
+
427
+ LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]
428
+
429
+ # Accelerator constants
430
+ NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
431
+
432
+ CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
433
+ ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES"
434
+ NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
435
+ TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"
436
+ NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
437
+
438
+ NEURON_CORES = "neuron_cores"
439
+ GPU = "GPU"
440
+ TPU = "TPU"
441
+ NPU = "NPU"
442
+ HPU = "HPU"
443
+
444
+
445
+ RAY_WORKER_NICENESS = "RAY_worker_niceness"
446
+
447
+ # Default max_retries option in @ray.remote for non-actor
448
+ # tasks.
449
+ DEFAULT_TASK_MAX_RETRIES = 3
450
+
451
+ # Default max_concurrency option in @ray.remote for threaded actors.
452
+ DEFAULT_MAX_CONCURRENCY_THREADED = 1
453
+
454
+ # Default max_concurrency option in @ray.remote for async actors.
455
+ DEFAULT_MAX_CONCURRENCY_ASYNC = 1000
456
+
457
+ # Prefix for namespaces which are used internally by ray.
458
+ # Jobs within these namespaces should be hidden from users
459
+ # and should not be considered user activity.
460
+ # Please keep this in sync with the definition kRayInternalNamespacePrefix
461
+ # in /src/ray/gcs/gcs_server/gcs_job_manager.h.
462
+ RAY_INTERNAL_NAMESPACE_PREFIX = "_ray_internal_"
463
+ RAY_INTERNAL_DASHBOARD_NAMESPACE = f"{RAY_INTERNAL_NAMESPACE_PREFIX}dashboard"
464
+
465
+ # Ray internal flags. These flags should not be set by users, and we strip them on job
466
+ # submission.
467
+ # This should be consistent with src/ray/common/ray_internal_flag_def.h
468
+ RAY_INTERNAL_FLAGS = [
469
+ "RAY_JOB_ID",
470
+ "RAY_RAYLET_PID",
471
+ "RAY_OVERRIDE_NODE_ID_FOR_TESTING",
472
+ ]
473
+
474
+
475
+ def gcs_actor_scheduling_enabled():
476
+ return os.environ.get("RAY_gcs_actor_scheduling_enabled") == "true"
477
+
478
+
479
+ DEFAULT_RESOURCES = {"CPU", "GPU", "memory", "object_store_memory"}
480
+
481
+ # Supported Python versions for runtime env's "conda" field. Ray downloads
482
+ # Ray wheels into the conda environment, so the Ray wheels for these Python
483
+ # versions must be available online.
484
+ RUNTIME_ENV_CONDA_PY_VERSIONS = [(3, 9), (3, 10), (3, 11), (3, 12)]
485
+
486
+ # Whether to enable Ray clusters (in addition to local Ray).
487
+ # Ray clusters are not explicitly supported for Windows and OSX.
488
+ IS_WINDOWS_OR_OSX = sys.platform == "darwin" or sys.platform == "win32"
489
+ ENABLE_RAY_CLUSTERS_ENV_VAR = "RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER"
490
+ ENABLE_RAY_CLUSTER = env_bool(
491
+ ENABLE_RAY_CLUSTERS_ENV_VAR,
492
+ not IS_WINDOWS_OR_OSX,
493
+ )
494
+
495
+ SESSION_LATEST = "session_latest"
496
+ NUM_PORT_RETRIES = 40
497
+ NUM_REDIS_GET_RETRIES = int(os.environ.get("RAY_NUM_REDIS_GET_RETRIES", "20"))
498
+
499
+ # The allowed cached ports in Ray. Refer to Port configuration for more details:
500
+ # https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations
501
+ RAY_ALLOWED_CACHED_PORTS = {
502
+ "metrics_agent_port",
503
+ "metrics_export_port",
504
+ "dashboard_agent_listen_port",
505
+ "runtime_env_agent_port",
506
+ "gcs_server_port", # the `port` option for gcs port.
507
+ }
508
+
509
+ # Turn this on if actor task log offsets are expected to be recorded.
510
+ # With this enabled, actor task logs can be queried by task id.
511
+ RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING = env_bool(
512
+ "RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING", False
513
+ )
514
+
515
+ # RuntimeEnv env var to indicate it exports a function
516
+ WORKER_PROCESS_SETUP_HOOK_ENV_VAR = "__RAY_WORKER_PROCESS_SETUP_HOOK_ENV_VAR"
517
+ RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT_ENV_VAR = (
518
+ "RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT" # noqa
519
+ )
520
+
521
+ RAY_DEFAULT_LABEL_KEYS_PREFIX = "ray.io/"
522
+
523
+ RAY_TPU_MAX_CONCURRENT_CONNECTIONS_ENV_VAR = "RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS"
524
+
525
+ RAY_NODE_IP_FILENAME = "node_ip_address.json"
526
+
527
+ PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME = "bundle"
528
+
529
+ RAY_LOGGING_CONFIG_ENCODING = os.environ.get("RAY_LOGGING_CONFIG_ENCODING")
530
+
531
+ RAY_BACKEND_LOG_JSON_ENV_VAR = "RAY_BACKEND_LOG_JSON"
532
+
533
+ # Write export API events of all resource types to file if enabled.
534
+ # RAY_enable_export_api_write_config will not be considered if
535
+ # this is enabled.
536
+ RAY_ENABLE_EXPORT_API_WRITE = env_bool("RAY_enable_export_api_write", False)
537
+
538
+ # Comma-separated string of individual resource types
540
+ # to write export API events for. This configuration is only used if
540
+ # RAY_enable_export_api_write is not enabled. Full list of valid
541
+ # resource types in ExportEvent.SourceType enum in
542
+ # src/ray/protobuf/export_api/export_event.proto
543
+ # Example config:
544
+ # `export RAY_enable_export_api_write_config='EXPORT_SUBMISSION_JOB,EXPORT_ACTOR'`
545
+ RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR = os.environ.get(
546
+ "RAY_enable_export_api_write_config", ""
547
+ )
548
+ RAY_ENABLE_EXPORT_API_WRITE_CONFIG = RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR.split(",")
549
+
550
+ RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES = env_integer(
551
+ "RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES", 100 * 10**6
552
+ )
553
+
554
+ RAY_EXPORT_EVENT_MAX_BACKUP_COUNT = env_integer("RAY_EXPORT_EVENT_MAX_BACKUP_COUNT", 20)
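
A minimal usage sketch of the env_integer / env_float / env_bool helpers defined at the top of this file. The environment variable names (MY_INT, MY_BAD_INT, MY_FLAG, MY_MISSING) are illustrative only; the behavior shown (malformed integers fall back to the default, "1"/"true" map to True) follows directly from the helper code above.

# Hypothetical usage sketch for the env helpers in ray_constants.py.
import os

from ray._private import ray_constants as rc

os.environ["MY_INT"] = "42"        # parsed as an integer
os.environ["MY_BAD_INT"] = "4.5"   # not .isdigit(), so env_integer returns the default
os.environ["MY_FLAG"] = "1"        # "1" or "true" (case-insensitive) -> True

assert rc.env_integer("MY_INT", 0) == 42
assert rc.env_integer("MY_BAD_INT", 7) == 7     # falls back to the provided default
assert rc.env_float("MY_BAD_INT", 0.0) == 4.5   # env_float accepts "4.5"
assert rc.env_bool("MY_FLAG", False) is True
assert rc.env_integer("MY_MISSING", 3) == 3     # unset -> default
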
.venv/lib/python3.11/site-packages/ray/_private/ray_experimental_perf.py ADDED
@@ -0,0 +1,337 @@
1
+ """This is the script for `ray microbenchmark`."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from ray._private.ray_microbenchmark_helpers import timeit, asyncio_timeit
6
+ import multiprocessing
7
+ import ray
8
+ from ray.dag.compiled_dag_node import CompiledDAG
9
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
10
+
11
+ import ray.experimental.channel as ray_channel
12
+ from ray.dag import InputNode, MultiOutputNode
13
+ from ray._private.utils import (
14
+ get_or_create_event_loop,
15
+ )
16
+ from ray._private.test_utils import get_actor_node_id
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ @ray.remote
22
+ class DAGActor:
23
+ def echo(self, x):
24
+ return x
25
+
26
+ def echo_multiple(self, *x):
27
+ return x
28
+
29
+
30
+ def check_optimized_build():
31
+ if not ray._raylet.OPTIMIZED:
32
+ msg = (
33
+ "WARNING: Unoptimized build! "
34
+ "To benchmark an optimized build, try:\n"
35
+ "\tbazel build -c opt //:ray_pkg\n"
36
+ "You can also make this permanent by adding\n"
37
+ "\tbuild --compilation_mode=opt\n"
38
+ "to your user-wide ~/.bazelrc file. "
39
+ "(Do not add this to the project-level .bazelrc file.)"
40
+ )
41
+ logger.warning(msg)
42
+
43
+
44
+ def create_driver_actor():
45
+ return CompiledDAG.DAGDriverProxyActor.options(
46
+ scheduling_strategy=NodeAffinitySchedulingStrategy(
47
+ ray.get_runtime_context().get_node_id(), soft=False
48
+ )
49
+ ).remote()
50
+
51
+
52
+ def main(results=None):
53
+ results = results or []
54
+ loop = get_or_create_event_loop()
55
+
56
+ check_optimized_build()
57
+
58
+ print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")
59
+
60
+ #################################################
61
+ # Perf tests for channels, used in compiled DAGs.
62
+ #################################################
63
+ ray.init()
64
+
65
+ def put_channel_small(chans, do_get=False):
66
+ for chan in chans:
67
+ chan.write(b"0")
68
+ if do_get:
69
+ chan.read()
70
+
71
+ @ray.remote
72
+ class ChannelReader:
73
+ def ready(self):
74
+ return
75
+
76
+ def read(self, chans):
77
+ while True:
78
+ for chan in chans:
79
+ chan.read()
80
+
81
+ driver_actor = create_driver_actor()
82
+ driver_node = get_actor_node_id(driver_actor)
83
+ chans = [ray_channel.Channel(None, [(driver_actor, driver_node)], 1000)]
84
+ results += timeit(
85
+ "[unstable] local put:local get, single channel calls",
86
+ lambda: put_channel_small(chans, do_get=True),
87
+ )
88
+
89
+ reader = ChannelReader.remote()
90
+ reader_node = get_actor_node_id(reader)
91
+ chans = [ray_channel.Channel(None, [(reader, reader_node)], 1000)]
92
+ ray.get(reader.ready.remote())
93
+ reader.read.remote(chans)
94
+ results += timeit(
95
+ "[unstable] local put:1 remote get, single channel calls",
96
+ lambda: put_channel_small(chans),
97
+ )
98
+ ray.kill(reader)
99
+
100
+ n_cpu = multiprocessing.cpu_count() // 2
101
+ print(f"Testing multiple readers/channels, n={n_cpu}")
102
+
103
+ reader_and_node_list = []
104
+ for _ in range(n_cpu):
105
+ reader = ChannelReader.remote()
106
+ reader_node = get_actor_node_id(reader)
107
+ reader_and_node_list.append((reader, reader_node))
108
+ chans = [ray_channel.Channel(None, reader_and_node_list, 1000)]
109
+ ray.get([reader.ready.remote() for reader, _ in reader_and_node_list])
110
+ for reader, _ in reader_and_node_list:
111
+ reader.read.remote(chans)
112
+ results += timeit(
113
+ "[unstable] local put:n remote get, single channel calls",
114
+ lambda: put_channel_small(chans),
115
+ )
116
+ for reader, _ in reader_and_node_list:
117
+ ray.kill(reader)
118
+
119
+ reader = ChannelReader.remote()
120
+ reader_node = get_actor_node_id(reader)
121
+ chans = [
122
+ ray_channel.Channel(None, [(reader, reader_node)], 1000) for _ in range(n_cpu)
123
+ ]
124
+ ray.get(reader.ready.remote())
125
+ reader.read.remote(chans)
126
+ results += timeit(
127
+ "[unstable] local put:1 remote get, n channels calls",
128
+ lambda: put_channel_small(chans),
129
+ )
130
+ ray.kill(reader)
131
+
132
+ reader_and_node_list = []
133
+ for _ in range(n_cpu):
134
+ reader = ChannelReader.remote()
135
+ reader_node = get_actor_node_id(reader)
136
+ reader_and_node_list.append((reader, reader_node))
137
+ chans = [
138
+ ray_channel.Channel(None, [reader_and_node_list[i]], 1000) for i in range(n_cpu)
139
+ ]
140
+ ray.get([reader.ready.remote() for reader, _ in reader_and_node_list])
141
+ for chan, reader_node_tuple in zip(chans, reader_and_node_list):
142
+ reader = reader_node_tuple[0]
143
+ reader.read.remote([chan])
144
+ results += timeit(
145
+ "[unstable] local put:n remote get, n channels calls",
146
+ lambda: put_channel_small(chans),
147
+ )
148
+ for reader, _ in reader_and_node_list:
149
+ ray.kill(reader)
150
+
151
+ # Tests for compiled DAGs.
152
+
153
+ def _exec(dag, num_args=1, payload_size=1):
154
+ output_ref = dag.execute(*[b"x" * payload_size for _ in range(num_args)])
155
+ ray.get(output_ref)
156
+
157
+ async def exec_async(tag):
158
+ async def _exec_async():
159
+ fut = await compiled_dag.execute_async(b"x")
160
+ if not isinstance(fut, list):
161
+ await fut
162
+ else:
163
+ await asyncio.gather(*fut)
164
+
165
+ return await asyncio_timeit(
166
+ tag,
167
+ _exec_async,
168
+ )
169
+
170
+ # Single-actor DAG calls
171
+
172
+ a = DAGActor.remote()
173
+ with InputNode() as inp:
174
+ dag = a.echo.bind(inp)
175
+
176
+ results += timeit(
177
+ "[unstable] single-actor DAG calls", lambda: ray.get(dag.execute(b"x"))
178
+ )
179
+ compiled_dag = dag.experimental_compile()
180
+ results += timeit(
181
+ "[unstable] compiled single-actor DAG calls", lambda: _exec(compiled_dag)
182
+ )
183
+ del a
184
+
185
+ # Single-actor asyncio DAG calls
186
+
187
+ a = DAGActor.remote()
188
+ with InputNode() as inp:
189
+ dag = a.echo.bind(inp)
190
+ compiled_dag = dag.experimental_compile(enable_asyncio=True)
191
+ results += loop.run_until_complete(
192
+ exec_async(
193
+ "[unstable] compiled single-actor asyncio DAG calls",
194
+ )
195
+ )
196
+ del a
197
+
198
+ # Scatter-gather DAG calls
199
+
200
+ n_cpu = multiprocessing.cpu_count() // 2
201
+ actors = [DAGActor.remote() for _ in range(n_cpu)]
202
+ with InputNode() as inp:
203
+ dag = MultiOutputNode([a.echo.bind(inp) for a in actors])
204
+ results += timeit(
205
+ f"[unstable] scatter-gather DAG calls, n={n_cpu} actors",
206
+ lambda: ray.get(dag.execute(b"x")),
207
+ )
208
+ compiled_dag = dag.experimental_compile()
209
+ results += timeit(
210
+ f"[unstable] compiled scatter-gather DAG calls, n={n_cpu} actors",
211
+ lambda: _exec(compiled_dag),
212
+ )
213
+
214
+ # Scatter-gather asyncio DAG calls
215
+
216
+ actors = [DAGActor.remote() for _ in range(n_cpu)]
217
+ with InputNode() as inp:
218
+ dag = MultiOutputNode([a.echo.bind(inp) for a in actors])
219
+ compiled_dag = dag.experimental_compile(enable_asyncio=True)
220
+ results += loop.run_until_complete(
221
+ exec_async(
222
+ f"[unstable] compiled scatter-gather asyncio DAG calls, n={n_cpu} actors",
223
+ )
224
+ )
225
+
226
+ # Chain DAG calls
227
+
228
+ actors = [DAGActor.remote() for _ in range(n_cpu)]
229
+ with InputNode() as inp:
230
+ dag = inp
231
+ for a in actors:
232
+ dag = a.echo.bind(dag)
233
+ results += timeit(
234
+ f"[unstable] chain DAG calls, n={n_cpu} actors",
235
+ lambda: ray.get(dag.execute(b"x")),
236
+ )
237
+ compiled_dag = dag.experimental_compile()
238
+ results += timeit(
239
+ f"[unstable] compiled chain DAG calls, n={n_cpu} actors",
240
+ lambda: _exec(compiled_dag),
241
+ )
242
+
243
+ # Chain asyncio DAG calls
244
+
245
+ actors = [DAGActor.remote() for _ in range(n_cpu)]
246
+ with InputNode() as inp:
247
+ dag = inp
248
+ for a in actors:
249
+ dag = a.echo.bind(dag)
250
+ compiled_dag = dag.experimental_compile(enable_asyncio=True)
251
+ results += loop.run_until_complete(
252
+ exec_async(f"[unstable] compiled chain asyncio DAG calls, n={n_cpu} actors")
253
+ )
254
+
255
+ # Multiple args with small payloads
256
+
257
+ n_actors = 8
258
+ assert (
259
+ n_cpu > n_actors
260
+ ), f"n_cpu ({n_cpu}) must be greater than n_actors ({n_actors})"
261
+
262
+ actors = [DAGActor.remote() for _ in range(n_actors)]
263
+ with InputNode() as inp:
264
+ dag = MultiOutputNode([actors[i].echo.bind(inp[i]) for i in range(n_actors)])
265
+ payload_size = 1
266
+ results += timeit(
267
+ f"[unstable] multiple args with small payloads DAG calls, n={n_actors} actors",
268
+ lambda: ray.get(dag.execute(*[b"x" * payload_size for _ in range(n_actors)])),
269
+ )
270
+ compiled_dag = dag.experimental_compile()
271
+ results += timeit(
272
+ f"[unstable] compiled multiple args with small payloads DAG calls, "
273
+ f"n={n_actors} actors",
274
+ lambda: _exec(compiled_dag, num_args=n_actors, payload_size=payload_size),
275
+ )
276
+
277
+ # Multiple args with medium payloads
278
+
279
+ actors = [DAGActor.remote() for _ in range(n_actors)]
280
+ with InputNode() as inp:
281
+ dag = MultiOutputNode([actors[i].echo.bind(inp[i]) for i in range(n_actors)])
282
+ payload_size = 1024 * 1024
283
+ results += timeit(
284
+ f"[unstable] multiple args with medium payloads DAG calls, n={n_actors} actors",
285
+ lambda: ray.get(dag.execute(*[b"x" * payload_size for _ in range(n_actors)])),
286
+ )
287
+ compiled_dag = dag.experimental_compile()
288
+ results += timeit(
289
+ "[unstable] compiled multiple args with medium payloads DAG calls, "
290
+ f"n={n_actors} actors",
291
+ lambda: _exec(compiled_dag, num_args=n_actors, payload_size=payload_size),
292
+ )
293
+
294
+ # Multiple args with large payloads
295
+
296
+ actors = [DAGActor.remote() for _ in range(n_actors)]
297
+ with InputNode() as inp:
298
+ dag = MultiOutputNode([actors[i].echo.bind(inp[i]) for i in range(n_actors)])
299
+ payload_size = 10 * 1024 * 1024
300
+ results += timeit(
301
+ f"[unstable] multiple args with large payloads DAG calls, n={n_actors} actors",
302
+ lambda: ray.get(dag.execute(*[b"x" * payload_size for _ in range(n_actors)])),
303
+ )
304
+ compiled_dag = dag.experimental_compile()
305
+ results += timeit(
306
+ "[unstable] compiled multiple args with large payloads DAG calls, "
307
+ f"n={n_actors} actors",
308
+ lambda: _exec(compiled_dag, num_args=n_actors, payload_size=payload_size),
309
+ )
310
+
311
+ # Worst case for multiple arguments: a single actor takes all the arguments
312
+ # with small payloads.
313
+
314
+ actor = DAGActor.remote()
315
+ n_args = 8
316
+ with InputNode() as inp:
317
+ dag = actor.echo_multiple.bind(*[inp[i] for i in range(n_args)])
318
+ payload_size = 1
319
+ results += timeit(
320
+ "[unstable] single-actor with all args with small payloads DAG calls, "
321
+ "n=1 actors",
322
+ lambda: ray.get(dag.execute(*[b"x" * payload_size for _ in range(n_args)])),
323
+ )
324
+ compiled_dag = dag.experimental_compile()
325
+ results += timeit(
326
+ "[unstable] single-actor with all args with small payloads DAG calls, "
327
+ "n=1 actors",
328
+ lambda: _exec(compiled_dag, num_args=n_args, payload_size=payload_size),
329
+ )
330
+
331
+ ray.shutdown()
332
+
333
+ return results
334
+
335
+
336
+ if __name__ == "__main__":
337
+ main()
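
For readers unfamiliar with the DAG API exercised in this benchmark, here is a minimal, self-contained sketch of the pattern it times: bind a single actor method to an InputNode, execute it eagerly, then compile it with experimental_compile() and execute again. The actor name EchoActor is illustrative; the imports mirror the ones used in the script above.

# Minimal sketch of the DAG pattern benchmarked above (illustrative only).
import ray
from ray.dag import InputNode


@ray.remote
class EchoActor:
    def echo(self, x):
        return x


ray.init()
actor = EchoActor.remote()

with InputNode() as inp:
    dag = actor.echo.bind(inp)

# Eager execution: each call returns a reference that ray.get resolves.
assert ray.get(dag.execute(b"x")) == b"x"

# Compiled execution: the accelerated path used by the
# "[unstable] compiled ..." benchmarks above.
compiled_dag = dag.experimental_compile()
assert ray.get(compiled_dag.execute(b"x")) == b"x"

ray.shutdown()
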
.venv/lib/python3.11/site-packages/ray/_private/ray_microbenchmark_helpers.py ADDED
@@ -0,0 +1,91 @@
1
+ import time
2
+ from typing import List, Optional, Tuple
3
+ import os
4
+ import ray
5
+ import numpy as np
6
+
7
+ from contextlib import contextmanager
8
+
9
+ # Only run tests matching this filter pattern.
10
+
11
+ filter_pattern = os.environ.get("TESTS_TO_RUN", "")
12
+ skip_pattern = os.environ.get("TESTS_TO_SKIP", "")
13
+
14
+
15
+ def timeit(
16
+ name, fn, multiplier=1, warmup_time_sec=10
17
+ ) -> List[Optional[Tuple[str, float, float]]]:
18
+ if filter_pattern and filter_pattern not in name:
19
+ return [None]
20
+ if skip_pattern and skip_pattern in name:
21
+ return [None]
22
+ # sleep for a while to avoid noisy neighbors.
23
+ # related issue: https://github.com/ray-project/ray/issues/22045
24
+ time.sleep(warmup_time_sec)
25
+ # warmup
26
+ start = time.perf_counter()
27
+ count = 0
28
+ while time.perf_counter() - start < 1:
29
+ fn()
30
+ count += 1
31
+ # real run
32
+ step = count // 10 + 1
33
+ stats = []
34
+ for _ in range(4):
35
+ start = time.perf_counter()
36
+ count = 0
37
+ while time.perf_counter() - start < 2:
38
+ for _ in range(step):
39
+ fn()
40
+ count += step
41
+ end = time.perf_counter()
42
+ stats.append(multiplier * count / (end - start))
43
+
44
+ mean = np.mean(stats)
45
+ sd = np.std(stats)
46
+ print(name, "per second", round(mean, 2), "+-", round(sd, 2))
47
+ return [(name, mean, sd)]
48
+
49
+
50
+ async def asyncio_timeit(
51
+ name, async_fn, multiplier=1, warmup_time_sec=10
52
+ ) -> List[Optional[Tuple[str, float, float]]]:
53
+ if filter_pattern and filter_pattern not in name:
54
+ return [None]
55
+ if skip_pattern and skip_pattern in name:
56
+ return [None]
57
+ # sleep for a while to avoid noisy neighbors.
58
+ # related issue: https://github.com/ray-project/ray/issues/22045
59
+ time.sleep(warmup_time_sec)
60
+ # warmup
61
+ start = time.perf_counter()
62
+ count = 0
63
+ while time.perf_counter() - start < 1:
64
+ await async_fn()
65
+ count += 1
66
+ # real run
67
+ step = count // 10 + 1
68
+ stats = []
69
+ for _ in range(4):
70
+ start = time.perf_counter()
71
+ count = 0
72
+ while time.perf_counter() - start < 2:
73
+ for _ in range(step):
74
+ await async_fn()
75
+ count += step
76
+ end = time.perf_counter()
77
+ stats.append(multiplier * count / (end - start))
78
+
79
+ mean = np.mean(stats)
80
+ sd = np.std(stats)
81
+ print(name, "per second", round(mean, 2), "+-", round(sd, 2))
82
+ return [(name, mean, sd)]
83
+
84
+
85
+ @contextmanager
86
+ def ray_setup_and_teardown(**init_args):
87
+ ray.init(**init_args)
88
+ try:
89
+ yield None
90
+ finally:
91
+ ray.shutdown()
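
A small usage sketch combining ray_setup_and_teardown with timeit, the same pattern ray_perf.py relies on. The benchmark name and the no-op workload below are made up for illustration; warmup_time_sec=0 only skips the initial sleep, not the measurement loops.

# Illustrative driver for the helpers above; not part of the upstream file.
import ray
from ray._private.ray_microbenchmark_helpers import (
    ray_setup_and_teardown,
    timeit,
)


@ray.remote
def noop():
    return b"ok"


def bench_noop():
    ray.get(noop.remote())


with ray_setup_and_teardown(num_cpus=2):
    # Prints "<name> per second <mean> +- <sd>" and returns [(name, mean, sd)].
    # Note: if TESTS_TO_RUN is set and does not match the name, [None] is returned.
    results = timeit("example noop round trip", bench_noop, warmup_time_sec=0)
    print(results)
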
.venv/lib/python3.11/site-packages/ray/_private/ray_option_utils.py ADDED
@@ -0,0 +1,387 @@
1
+ """Manage, parse and validate options for Ray tasks, actors and actor methods."""
2
+ import warnings
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, Optional, Tuple, Union
5
+
6
+ import ray
7
+ from ray._private import ray_constants
8
+ from ray._private.utils import get_ray_doc_version
9
+ from ray.util.placement_group import PlacementGroup
10
+ from ray.util.scheduling_strategies import (
11
+ NodeAffinitySchedulingStrategy,
12
+ PlacementGroupSchedulingStrategy,
13
+ NodeLabelSchedulingStrategy,
14
+ )
15
+
16
+
17
+ @dataclass
18
+ class Option:
19
+ # Type constraint of an option.
20
+ type_constraint: Optional[Union[type, Tuple[type]]] = None
21
+ # Value constraint of an option.
22
+ # The callable should return None if there is no error.
23
+ # Otherwise, return the error message.
24
+ value_constraint: Optional[Callable[[Any], Optional[str]]] = None
25
+ # Default value.
26
+ default_value: Any = None
27
+
28
+ def validate(self, keyword: str, value: Any):
29
+ """Validate the option."""
30
+ if self.type_constraint is not None:
31
+ if not isinstance(value, self.type_constraint):
32
+ raise TypeError(
33
+ f"The type of keyword '{keyword}' must be {self.type_constraint}, "
34
+ f"but received type {type(value)}"
35
+ )
36
+ if self.value_constraint is not None:
37
+ possible_error_message = self.value_constraint(value)
38
+ if possible_error_message:
39
+ raise ValueError(possible_error_message)
40
+
41
+
42
+ def _counting_option(name: str, infinite: bool = True, default_value: Any = None):
43
+ """This is used for positive and discrete options.
44
+
45
+ Args:
46
+ name: The name of the option keyword.
47
+ infinite: If True, user could use -1 to represent infinity.
48
+ default_value: The default value for this option.
49
+ """
50
+ if infinite:
51
+ return Option(
52
+ (int, type(None)),
53
+ lambda x: None
54
+ if (x is None or x >= -1)
55
+ else f"The keyword '{name}' only accepts None, 0, -1"
56
+ " or a positive integer, where -1 represents infinity.",
57
+ default_value=default_value,
58
+ )
59
+ return Option(
60
+ (int, type(None)),
61
+ lambda x: None
62
+ if (x is None or x >= 0)
63
+ else f"The keyword '{name}' only accepts None, 0 or a positive integer.",
64
+ default_value=default_value,
65
+ )
66
+
67
+
68
+ def _validate_resource_quantity(name, quantity):
69
+ if quantity < 0:
70
+ return f"The quantity of resource {name} cannot be negative"
71
+ if (
72
+ isinstance(quantity, float)
73
+ and quantity != 0.0
74
+ and int(quantity * ray._raylet.RESOURCE_UNIT_SCALING) == 0
75
+ ):
76
+ return (
77
+ f"The precision of the fractional quantity of resource {name}"
78
+ " cannot go beyond 0.0001"
79
+ )
80
+ resource_name = "GPU" if name == "num_gpus" else name
81
+ if resource_name in ray._private.accelerators.get_all_accelerator_resource_names():
82
+ (
83
+ valid,
84
+ error_message,
85
+ ) = ray._private.accelerators.get_accelerator_manager_for_resource(
86
+ resource_name
87
+ ).validate_resource_request_quantity(
88
+ quantity
89
+ )
90
+ if not valid:
91
+ return error_message
92
+ return None
93
+
94
+
95
+ def _resource_option(name: str, default_value: Any = None):
96
+ """This is used for resource related options."""
97
+ return Option(
98
+ (float, int, type(None)),
99
+ lambda x: None if (x is None) else _validate_resource_quantity(name, x),
100
+ default_value=default_value,
101
+ )
102
+
103
+
104
+ def _validate_resources(resources: Optional[Dict[str, float]]) -> Optional[str]:
105
+ if resources is None:
106
+ return None
107
+
108
+ if "CPU" in resources or "GPU" in resources:
109
+ return (
110
+ "Use the 'num_cpus' and 'num_gpus' keyword instead of 'CPU' and 'GPU' "
111
+ "in 'resources' keyword"
112
+ )
113
+
114
+ for name, quantity in resources.items():
115
+ possible_error_message = _validate_resource_quantity(name, quantity)
116
+ if possible_error_message:
117
+ return possible_error_message
118
+
119
+ return None
120
+
121
+
122
+ _common_options = {
123
+ "accelerator_type": Option((str, type(None))),
124
+ "memory": _resource_option("memory"),
125
+ "name": Option((str, type(None))),
126
+ "num_cpus": _resource_option("num_cpus"),
127
+ "num_gpus": _resource_option("num_gpus"),
128
+ "object_store_memory": _counting_option("object_store_memory", False),
129
+ # TODO(suquark): "placement_group", "placement_group_bundle_index"
130
+ # and "placement_group_capture_child_tasks" are deprecated,
131
+ # use "scheduling_strategy" instead.
132
+ "placement_group": Option(
133
+ (type(None), str, PlacementGroup), default_value="default"
134
+ ),
135
+ "placement_group_bundle_index": Option(int, default_value=-1),
136
+ "placement_group_capture_child_tasks": Option((bool, type(None))),
137
+ "resources": Option((dict, type(None)), lambda x: _validate_resources(x)),
138
+ "runtime_env": Option((dict, type(None))),
139
+ "scheduling_strategy": Option(
140
+ (
141
+ type(None),
142
+ str,
143
+ PlacementGroupSchedulingStrategy,
144
+ NodeAffinitySchedulingStrategy,
145
+ NodeLabelSchedulingStrategy,
146
+ )
147
+ ),
148
+ "_metadata": Option((dict, type(None))),
149
+ "enable_task_events": Option(bool, default_value=True),
150
+ "_labels": Option((dict, type(None))),
151
+ }
152
+
153
+
154
+ def issubclass_safe(obj: Any, cls_: type) -> bool:
155
+ try:
156
+ return issubclass(obj, cls_)
157
+ except TypeError:
158
+ return False
159
+
160
+
161
+ _task_only_options = {
162
+ "max_calls": _counting_option("max_calls", False, default_value=0),
163
+ # Normal tasks may be retried on failure this many times.
164
+ # TODO(swang): Allow this to be set globally for an application.
165
+ "max_retries": _counting_option(
166
+ "max_retries", default_value=ray_constants.DEFAULT_TASK_MAX_RETRIES
167
+ ),
168
+ # override "_common_options"
169
+ "num_cpus": _resource_option("num_cpus", default_value=1),
170
+ "num_returns": Option(
171
+ (int, str, type(None)),
172
+ lambda x: None
173
+ if (x is None or x == "dynamic" or x == "streaming" or x >= 0)
174
+ else "Default None. When None is passed, "
175
+ "the default value is 1 for a task and actor task, and "
176
+ "'streaming' for generator tasks and generator actor tasks. "
177
+ "The keyword 'num_returns' only accepts None, "
178
+ "a non-negative integer, "
179
+ "'streaming' (for generators), or 'dynamic'. 'dynamic' flag "
180
+ "will be deprecated in the future, and it is recommended to use "
181
+ "'streaming' instead.",
182
+ default_value=None,
183
+ ),
184
+ "object_store_memory": Option( # override "_common_options"
185
+ (int, type(None)),
186
+ lambda x: None
187
+ if (x is None)
188
+ else "Setting 'object_store_memory' is not implemented for tasks",
189
+ ),
190
+ "retry_exceptions": Option(
191
+ (bool, list, tuple),
192
+ lambda x: None
193
+ if (
194
+ isinstance(x, bool)
195
+ or (
196
+ isinstance(x, (list, tuple))
197
+ and all(issubclass_safe(x_, Exception) for x_ in x)
198
+ )
199
+ )
200
+ else "retry_exceptions must be either a boolean or a list of exceptions",
201
+ default_value=False,
202
+ ),
203
+ "_generator_backpressure_num_objects": Option(
204
+ (int, type(None)),
205
+ lambda x: None
206
+ if x != 0
207
+ else (
208
+ "_generator_backpressure_num_objects=0 is not allowed. "
209
+ "is identical to a Python generator (generating 1 object "
210
+ "is identical to Python generator (generator 1 object "
211
+ "whenever `next` is called). Use -1 to disable this feature. "
212
+ ),
213
+ ),
214
+ }
215
+
216
+ _actor_only_options = {
217
+ "concurrency_groups": Option((list, dict, type(None))),
218
+ "lifetime": Option(
219
+ (str, type(None)),
220
+ lambda x: None
221
+ if x in (None, "detached", "non_detached")
222
+ else "actor `lifetime` argument must be one of 'detached', "
223
+ "'non_detached' and 'None'.",
224
+ ),
225
+ "max_concurrency": _counting_option("max_concurrency", False),
226
+ "max_restarts": _counting_option("max_restarts", default_value=0),
227
+ "max_task_retries": _counting_option("max_task_retries", default_value=0),
228
+ "max_pending_calls": _counting_option("max_pending_calls", default_value=-1),
229
+ "namespace": Option((str, type(None))),
230
+ "get_if_exists": Option(bool, default_value=False),
231
+ }
232
+
233
+ # Priority is important here because during dictionary update, same key with higher
234
+ # priority overrides the same key with lower priority. We make use of priority
235
+ # to set the correct default value for tasks / actors.
236
+
237
+ # priority: _common_options > _actor_only_options > _task_only_options
238
+ valid_options: Dict[str, Option] = {
239
+ **_task_only_options,
240
+ **_actor_only_options,
241
+ **_common_options,
242
+ }
243
+ # priority: _task_only_options > _common_options
244
+ task_options: Dict[str, Option] = {**_common_options, **_task_only_options}
245
+ # priority: _actor_only_options > _common_options
246
+ actor_options: Dict[str, Option] = {**_common_options, **_actor_only_options}
247
+
248
+ remote_args_error_string = (
249
+ "The @ray.remote decorator must be applied either with no arguments and no "
250
+ "parentheses, for example '@ray.remote', or it must be applied using some of "
251
+ f"the arguments in the list {list(valid_options.keys())}, for example "
252
+ "'@ray.remote(num_returns=2, resources={\"CustomResource\": 1})'."
253
+ )
254
+
255
+
256
+ def _check_deprecate_placement_group(options: Dict[str, Any]):
257
+ """Check if deprecated placement group option exists."""
258
+ placement_group = options.get("placement_group", "default")
259
+ scheduling_strategy = options.get("scheduling_strategy")
260
+ # TODO(suquark): @ray.remote(placement_group=None) is used in
261
+ # "python/ray.data._internal/remote_fn.py" and many other places,
262
+ # while "ray.data.read_api.read_datasource" set "scheduling_strategy=SPREAD".
263
+ # This might be a bug, but it is also ok to allow them co-exist.
264
+ if (placement_group not in ("default", None)) and (scheduling_strategy is not None):
265
+ raise ValueError(
266
+ "Placement groups should be specified via the "
267
+ "scheduling_strategy option. "
268
+ "The placement_group option is deprecated."
269
+ )
270
+
271
+
272
+ def _warn_if_using_deprecated_placement_group(
273
+ options: Dict[str, Any], caller_stacklevel: int
274
+ ):
275
+ placement_group = options["placement_group"]
276
+ placement_group_bundle_index = options["placement_group_bundle_index"]
277
+ placement_group_capture_child_tasks = options["placement_group_capture_child_tasks"]
278
+ if placement_group != "default":
279
+ warnings.warn(
280
+ "placement_group parameter is deprecated. Use "
281
+ "scheduling_strategy=PlacementGroupSchedulingStrategy(...) "
282
+ "instead, see the usage at "
283
+ f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-remote.", # noqa: E501
284
+ DeprecationWarning,
285
+ stacklevel=caller_stacklevel + 1,
286
+ )
287
+ if placement_group_bundle_index != -1:
288
+ warnings.warn(
289
+ "placement_group_bundle_index parameter is deprecated. Use "
290
+ "scheduling_strategy=PlacementGroupSchedulingStrategy(...) "
291
+ "instead, see the usage at "
292
+ f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-remote.", # noqa: E501
293
+ DeprecationWarning,
294
+ stacklevel=caller_stacklevel + 1,
295
+ )
296
+ if placement_group_capture_child_tasks:
297
+ warnings.warn(
298
+ "placement_group_capture_child_tasks parameter is deprecated. Use "
299
+ "scheduling_strategy=PlacementGroupSchedulingStrategy(...) "
300
+ "instead, see the usage at "
301
+ f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-remote.", # noqa: E501
302
+ DeprecationWarning,
303
+ stacklevel=caller_stacklevel + 1,
304
+ )
305
+
306
+
307
+ def validate_task_options(options: Dict[str, Any], in_options: bool):
308
+ """Options check for Ray tasks.
309
+
310
+ Args:
311
+ options: Options for Ray tasks.
312
+ in_options: If True, we are checking the options under the context of
313
+ ".options()".
314
+ """
315
+ for k, v in options.items():
316
+ if k not in task_options:
317
+ raise ValueError(
318
+ f"Invalid option keyword {k} for remote functions. "
319
+ f"Valid ones are {list(task_options)}."
320
+ )
321
+ task_options[k].validate(k, v)
322
+ if in_options and "max_calls" in options:
323
+ raise ValueError("Setting 'max_calls' is not supported in '.options()'.")
324
+ _check_deprecate_placement_group(options)
325
+
326
+
327
+ def validate_actor_options(options: Dict[str, Any], in_options: bool):
328
+ """Options check for Ray actors.
329
+
330
+ Args:
331
+ options: Options for Ray actors.
332
+ in_options: If True, we are checking the options under the context of
333
+ ".options()".
334
+ """
335
+ for k, v in options.items():
336
+ if k not in actor_options:
337
+ raise ValueError(
338
+ f"Invalid option keyword {k} for actors. "
339
+ f"Valid ones are {list(actor_options)}."
340
+ )
341
+ actor_options[k].validate(k, v)
342
+
343
+ if in_options and "concurrency_groups" in options:
344
+ raise ValueError(
345
+ "Setting 'concurrency_groups' is not supported in '.options()'."
346
+ )
347
+
348
+ if options.get("get_if_exists") and not options.get("name"):
349
+ raise ValueError("The actor name must be specified to use `get_if_exists`.")
350
+
351
+ if "object_store_memory" in options:
352
+ warnings.warn(
353
+ "Setting 'object_store_memory'"
354
+ " for actors is deprecated since it doesn't actually"
355
+ " reserve the required object store memory."
356
+ f" Use object spilling that's enabled by default (https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/objects/object-spilling.html) " # noqa: E501
357
+ "instead to bypass the object store memory size limitation.",
358
+ DeprecationWarning,
359
+ stacklevel=1,
360
+ )
361
+
362
+ _check_deprecate_placement_group(options)
363
+
364
+
365
+ def update_options(
366
+ original_options: Dict[str, Any], new_options: Dict[str, Any]
367
+ ) -> Dict[str, Any]:
368
+ """Update original options with new options and return.
369
+ The returned updated options contain a shallow copy of the original options.
370
+ """
371
+
372
+ updated_options = {**original_options, **new_options}
373
+ # Ensure we update each namespace in "_metadata" independently.
374
+ # "_metadata" is a dict like {namespace1: config1, namespace2: config2}
375
+ if (
376
+ original_options.get("_metadata") is not None
377
+ and new_options.get("_metadata") is not None
378
+ ):
379
+ # make a shallow copy to avoid messing up the metadata dict in
380
+ # the original options.
381
+ metadata = original_options["_metadata"].copy()
382
+ for namespace, config in new_options["_metadata"].items():
383
+ metadata[namespace] = {**metadata.get(namespace, {}), **config}
384
+
385
+ updated_options["_metadata"] = metadata
386
+
387
+ return updated_options
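
A short sketch of how update_options performs a shallow merge while merging each "_metadata" namespace independently rather than overwriting it wholesale. The option values and the namespace key below are arbitrary examples, not taken from this diff.

# Illustrative example for update_options; values are arbitrary.
from ray._private.ray_option_utils import update_options

original = {
    "num_cpus": 1,
    "_metadata": {"example.io/options": {"task_id": "a"}},
}
new = {
    "num_cpus": 2,
    "_metadata": {"example.io/options": {"checkpoint": True}},
}

merged = update_options(original, new)
assert merged["num_cpus"] == 2
# Both keys survive because each "_metadata" namespace is merged per key.
assert merged["_metadata"]["example.io/options"] == {
    "task_id": "a",
    "checkpoint": True,
}
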
.venv/lib/python3.11/site-packages/ray/_private/ray_perf.py ADDED
@@ -0,0 +1,328 @@
1
+ """This is the script for `ray microbenchmark`."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from ray._private.ray_microbenchmark_helpers import timeit
6
+ from ray._private.ray_client_microbenchmark import main as client_microbenchmark_main
7
+ import numpy as np
8
+ import multiprocessing
9
+ import ray
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @ray.remote(num_cpus=0)
15
+ class Actor:
16
+ def small_value(self):
17
+ return b"ok"
18
+
19
+ def small_value_arg(self, x):
20
+ return b"ok"
21
+
22
+ def small_value_batch(self, n):
23
+ ray.get([small_value.remote() for _ in range(n)])
24
+
25
+
26
+ @ray.remote
27
+ class AsyncActor:
28
+ async def small_value(self):
29
+ return b"ok"
30
+
31
+ async def small_value_with_arg(self, x):
32
+ return b"ok"
33
+
34
+ async def small_value_batch(self, n):
35
+ await asyncio.wait([small_value.remote() for _ in range(n)])
36
+
37
+
38
+ @ray.remote(num_cpus=0)
39
+ class Client:
40
+ def __init__(self, servers):
41
+ if not isinstance(servers, list):
42
+ servers = [servers]
43
+ self.servers = servers
44
+
45
+ def small_value_batch(self, n):
46
+ results = []
47
+ for s in self.servers:
48
+ results.extend([s.small_value.remote() for _ in range(n)])
49
+ ray.get(results)
50
+
51
+ def small_value_batch_arg(self, n):
52
+ x = ray.put(0)
53
+ results = []
54
+ for s in self.servers:
55
+ results.extend([s.small_value_arg.remote(x) for _ in range(n)])
56
+ ray.get(results)
57
+
58
+
59
+ @ray.remote
60
+ def small_value():
61
+ return b"ok"
62
+
63
+
64
+ @ray.remote
65
+ def small_value_batch(n):
66
+ submitted = [small_value.remote() for _ in range(n)]
67
+ ray.get(submitted)
68
+ return 0
69
+
70
+
71
+ @ray.remote
72
+ def create_object_containing_ref():
73
+ obj_refs = []
74
+ for _ in range(10000):
75
+ obj_refs.append(ray.put(1))
76
+ return obj_refs
77
+
78
+
79
+ def check_optimized_build():
80
+ if not ray._raylet.OPTIMIZED:
81
+ msg = (
82
+ "WARNING: Unoptimized build! "
83
+ "To benchmark an optimized build, try:\n"
84
+ "\tbazel build -c opt //:ray_pkg\n"
85
+ "You can also make this permanent by adding\n"
86
+ "\tbuild --compilation_mode=opt\n"
87
+ "to your user-wide ~/.bazelrc file. "
88
+ "(Do not add this to the project-level .bazelrc file.)"
89
+ )
90
+ logger.warning(msg)
91
+
92
+
93
+ def main(results=None):
94
+ results = results or []
95
+
96
+ check_optimized_build()
97
+
98
+ print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")
99
+
100
+ ray.init()
101
+
102
+ value = ray.put(0)
103
+
104
+ def get_small():
105
+ ray.get(value)
106
+
107
+ def put_small():
108
+ ray.put(0)
109
+
110
+ @ray.remote
111
+ def do_put_small():
112
+ for _ in range(100):
113
+ ray.put(0)
114
+
115
+ def put_multi_small():
116
+ ray.get([do_put_small.remote() for _ in range(10)])
117
+
118
+ arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
119
+
120
+ results += timeit("single client get calls (Plasma Store)", get_small)
121
+
122
+ results += timeit("single client put calls (Plasma Store)", put_small)
123
+
124
+ results += timeit("multi client put calls (Plasma Store)", put_multi_small, 1000)
125
+
126
+ def put_large():
127
+ ray.put(arr)
128
+
129
+ results += timeit("single client put gigabytes", put_large, 8 * 0.1)
130
+
131
+ def small_value_batch():
132
+ submitted = [small_value.remote() for _ in range(1000)]
133
+ ray.get(submitted)
134
+ return 0
135
+
136
+ results += timeit("single client tasks and get batch", small_value_batch)
137
+
138
+ @ray.remote
139
+ def do_put():
140
+ for _ in range(10):
141
+ ray.put(np.zeros(10 * 1024 * 1024, dtype=np.int64))
142
+
143
+ def put_multi():
144
+ ray.get([do_put.remote() for _ in range(10)])
145
+
146
+ results += timeit("multi client put gigabytes", put_multi, 10 * 8 * 0.1)
147
+
148
+ obj_containing_ref = create_object_containing_ref.remote()
149
+
150
+ def get_containing_object_ref():
151
+ ray.get(obj_containing_ref)
152
+
153
+ results += timeit(
154
+ "single client get object containing 10k refs", get_containing_object_ref
155
+ )
156
+
157
+ def wait_multiple_refs():
158
+ num_objs = 1000
159
+ not_ready = [small_value.remote() for _ in range(num_objs)]
160
+ # We only need to trigger the fetch_local once for each object,
161
+ # raylet will persist these fetch requests even after ray.wait returns.
162
+ # See https://github.com/ray-project/ray/issues/30375.
163
+ fetch_local = True
164
+ for _ in range(num_objs):
165
+ _ready, not_ready = ray.wait(not_ready, fetch_local=fetch_local)
166
+ if fetch_local:
167
+ fetch_local = False
168
+
169
+ results += timeit("single client wait 1k refs", wait_multiple_refs)
170
+
171
+ def small_task():
172
+ ray.get(small_value.remote())
173
+
174
+ results += timeit("single client tasks sync", small_task)
175
+
176
+ def small_task_async():
177
+ ray.get([small_value.remote() for _ in range(1000)])
178
+
179
+ results += timeit("single client tasks async", small_task_async, 1000)
180
+
181
+ n = 10000
182
+ m = 4
183
+ actors = [Actor.remote() for _ in range(m)]
184
+
185
+ def multi_task():
186
+ submitted = [a.small_value_batch.remote(n) for a in actors]
187
+ ray.get(submitted)
188
+
189
+ results += timeit("multi client tasks async", multi_task, n * m)
190
+
191
+ a = Actor.remote()
192
+
193
+ def actor_sync():
194
+ ray.get(a.small_value.remote())
195
+
196
+ results += timeit("1:1 actor calls sync", actor_sync)
197
+
198
+ a = Actor.remote()
199
+
200
+ def actor_async():
201
+ ray.get([a.small_value.remote() for _ in range(1000)])
202
+
203
+ results += timeit("1:1 actor calls async", actor_async, 1000)
204
+
205
+ a = Actor.options(max_concurrency=16).remote()
206
+
207
+ def actor_concurrent():
208
+ ray.get([a.small_value.remote() for _ in range(1000)])
209
+
210
+ results += timeit("1:1 actor calls concurrent", actor_concurrent, 1000)
211
+
212
+ n = 5000
213
+ n_cpu = multiprocessing.cpu_count() // 2
214
+ actors = [Actor._remote() for _ in range(n_cpu)]
215
+ client = Client.remote(actors)
216
+
217
+ def actor_async_direct():
218
+ ray.get(client.small_value_batch.remote(n))
219
+
220
+ results += timeit("1:n actor calls async", actor_async_direct, n * len(actors))
221
+
222
+ n_cpu = multiprocessing.cpu_count() // 2
223
+ a = [Actor.remote() for _ in range(n_cpu)]
224
+
225
+ @ray.remote
226
+ def work(actors):
227
+ ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])
228
+
229
+ def actor_multi2():
230
+ ray.get([work.remote(a) for _ in range(m)])
231
+
232
+ results += timeit("n:n actor calls async", actor_multi2, m * n)
233
+
234
+ n = 1000
235
+ actors = [Actor._remote() for _ in range(n_cpu)]
236
+ clients = [Client.remote(a) for a in actors]
237
+
238
+ def actor_multi2_direct_arg():
239
+ ray.get([c.small_value_batch_arg.remote(n) for c in clients])
240
+
241
+ results += timeit(
242
+ "n:n actor calls with arg async", actor_multi2_direct_arg, n * len(clients)
243
+ )
244
+
245
+ a = AsyncActor.remote()
246
+
247
+ def actor_sync():
248
+ ray.get(a.small_value.remote())
249
+
250
+ results += timeit("1:1 async-actor calls sync", actor_sync)
251
+
252
+ a = AsyncActor.remote()
253
+
254
+ def async_actor():
255
+ ray.get([a.small_value.remote() for _ in range(1000)])
256
+
257
+ results += timeit("1:1 async-actor calls async", async_actor, 1000)
258
+
259
+ a = AsyncActor.remote()
260
+
261
+ def async_actor():
262
+ ray.get([a.small_value_with_arg.remote(i) for i in range(1000)])
263
+
264
+ results += timeit("1:1 async-actor calls with args async", async_actor, 1000)
265
+
266
+ n = 5000
267
+ n_cpu = multiprocessing.cpu_count() // 2
268
+ actors = [AsyncActor.remote() for _ in range(n_cpu)]
269
+ client = Client.remote(actors)
270
+
271
+ def async_actor_async():
272
+ ray.get(client.small_value_batch.remote(n))
273
+
274
+ results += timeit("1:n async-actor calls async", async_actor_async, n * len(actors))
275
+
276
+ n = 5000
277
+ m = 4
278
+ n_cpu = multiprocessing.cpu_count() // 2
279
+ a = [AsyncActor.remote() for _ in range(n_cpu)]
280
+
281
+ @ray.remote
282
+ def async_actor_work(actors):
283
+ ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])
284
+
285
+ def async_actor_multi():
286
+ ray.get([async_actor_work.remote(a) for _ in range(m)])
287
+
288
+ results += timeit("n:n async-actor calls async", async_actor_multi, m * n)
289
+ ray.shutdown()
290
+
291
+ ############################
292
+ # End of channel perf tests.
293
+ ############################
294
+
295
+ NUM_PGS = 100
296
+ NUM_BUNDLES = 1
297
+ ray.init(resources={"custom": 100})
298
+
299
+ def placement_group_create_removal(num_pgs):
300
+ pgs = [
301
+ ray.util.placement_group(
302
+ bundles=[{"custom": 0.001} for _ in range(NUM_BUNDLES)]
303
+ )
304
+ for _ in range(num_pgs)
305
+ ]
306
+ [pg.wait(timeout_seconds=30) for pg in pgs]
307
+ # Include placement group removal here to clean up.
308
+ # If we don't clean up placement groups, overall performance
309
+ # degrades as more iterations run.
310
+ # Since the timeit helper calls this function repeatedly without any
311
+ # separate cleanup step, the removal has to happen inside it.
312
+ for pg in pgs:
313
+ ray.util.remove_placement_group(pg)
314
+
315
+ results += timeit(
316
+ "placement group create/removal",
317
+ lambda: placement_group_create_removal(NUM_PGS),
318
+ NUM_PGS,
319
+ )
320
+ ray.shutdown()
321
+
322
+ client_microbenchmark_main(results)
323
+
324
+ return results
325
+
326
+
327
+ if __name__ == "__main__":
328
+ main()
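
A minimal usage sketch (not part of the commit above), assuming the file is importable as ray._private.ray_perf and a local Ray installation is available; main() starts Ray, runs every microbenchmark, and returns the accumulated timeit results:

# Sketch only: run the benchmark suite programmatically and print the results.
from ray._private.ray_perf import main

results = main()  # runs all benchmarks against a locally started Ray instance
for entry in results:
    print(entry)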
.venv/lib/python3.11/site-packages/ray/_private/ray_process_reaper.py ADDED
@@ -0,0 +1,60 @@
1
+ import atexit
2
+ import os
3
+ import signal
4
+ import sys
5
+ import time
6
+
7
+ """
8
+ This is a lightweight "reaper" process used to ensure that ray processes are
9
+ cleaned up properly when the main ray process dies unexpectedly (e.g.,
10
+ segfaults or gets SIGKILLed). Note that processes may not be cleaned up
11
+ properly if this process is SIGTERMed or SIGKILLed.
12
+
13
+ It detects that its parent has died by reading from stdin, which must be
14
+ inherited from the parent process so that the OS will deliver an EOF if the
15
+ parent dies. When this happens, the reaper process kills the rest of its
16
+ process group (first attempting graceful shutdown with SIGTERM, then escalating
17
+ to SIGKILL).
18
+ """
19
+
20
+ SIGTERM_GRACE_PERIOD_SECONDS = 1
21
+
22
+
23
+ def reap_process_group(*args):
24
+ def sigterm_handler(*args):
25
+ # Give a one-second grace period for other processes to clean up.
26
+ time.sleep(SIGTERM_GRACE_PERIOD_SECONDS)
27
+ # SIGKILL the pgroup (including ourselves) as a last-resort.
28
+ if sys.platform == "win32":
29
+ atexit.unregister(sigterm_handler)
30
+ os.kill(0, signal.CTRL_BREAK_EVENT)
31
+ else:
32
+ os.killpg(0, signal.SIGKILL)
33
+
34
+ # Set a SIGTERM handler to handle SIGTERMing ourselves with the group.
35
+ if sys.platform == "win32":
36
+ atexit.register(sigterm_handler)
37
+ else:
38
+ signal.signal(signal.SIGTERM, sigterm_handler)
39
+
40
+ # Our parent must have died, SIGTERM the group (including ourselves).
41
+ if sys.platform == "win32":
42
+ os.kill(0, signal.CTRL_C_EVENT)
43
+ else:
44
+ os.killpg(0, signal.SIGTERM)
45
+
46
+
47
+ def main():
48
+ # Read from stdin forever. Because stdin is a file descriptor
49
+ # inherited from our parent process, we will get an EOF if the parent
50
+ # dies, which is signaled by an empty return from read().
51
+ # We intentionally don't set any signal handlers here, so a SIGTERM from
52
+ # the parent can be used to kill this process gracefully without it killing
53
+ # the rest of the process group.
54
+ while len(sys.stdin.read()) != 0:
55
+ pass
56
+ reap_process_group()
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
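
A minimal sketch (not part of the commit above) of the stdin-EOF pattern the module docstring describes: the parent binds a pipe to the child's stdin, so the child's blocking read returns an empty string (EOF) once the write end closes, e.g. because the parent process died. The child command here is a stand-in, not the actual reaper invocation.

import subprocess
import sys

# The child blocks on stdin; it unblocks only when the parent-side write end
# of the pipe is closed (for example, because the parent process exited).
child = subprocess.Popen(
    [sys.executable, "-c", "import sys; sys.stdin.read(); print('parent gone')"],
    stdin=subprocess.PIPE,
)
# Closing our end simulates the parent dying; the child then proceeds.
child.stdin.close()
child.wait()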
.venv/lib/python3.11/site-packages/ray/_private/resource_spec.py ADDED
@@ -0,0 +1,317 @@
1
+ import logging
2
+ import sys
3
+ from collections import namedtuple
4
+ from typing import Optional
5
+
6
+ import ray
7
+ import ray._private.ray_constants as ray_constants
8
+
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Prefix for the node id resource that is automatically added to each node.
13
+ # For example, a node may have id `node:172.23.42.1`.
14
+ NODE_ID_PREFIX = "node:"
15
+ # The system resource that the head node has.
16
+ HEAD_NODE_RESOURCE_NAME = NODE_ID_PREFIX + "__internal_head__"
17
+
18
+
19
+ class ResourceSpec(
20
+ namedtuple(
21
+ "ResourceSpec",
22
+ [
23
+ "num_cpus",
24
+ "num_gpus",
25
+ "memory",
26
+ "object_store_memory",
27
+ "resources",
28
+ "redis_max_memory",
29
+ ],
30
+ )
31
+ ):
32
+ """Represents the resource configuration passed to a raylet.
33
+
34
+ All fields can be None. Before starting services, resolve() should be
35
+ called to return a ResourceSpec with unknown values filled in with
36
+ defaults based on the local machine specifications.
37
+
38
+ Attributes:
39
+ num_cpus: The CPUs allocated for this raylet.
40
+ num_gpus: The GPUs allocated for this raylet.
41
+ memory: The memory allocated for this raylet.
42
+ object_store_memory: The object store memory allocated for this raylet.
43
+ Note that when calling to_resource_dict(), this will be scaled down
44
+ by 30% to account for the global plasma LRU reserve.
45
+ resources: The custom resources allocated for this raylet.
46
+ redis_max_memory: The max amount of memory (in bytes) to allow each
47
+ redis shard to use. Once the limit is exceeded, redis will start
48
+ LRU eviction of entries. This only applies to the sharded redis
49
+ tables (task, object, and profile tables). By default, this is
50
+ capped at 10GB but can be set higher.
51
+ """
52
+
53
+ def __new__(
54
+ cls,
55
+ num_cpus=None,
56
+ num_gpus=None,
57
+ memory=None,
58
+ object_store_memory=None,
59
+ resources=None,
60
+ redis_max_memory=None,
61
+ ):
62
+ return super(ResourceSpec, cls).__new__(
63
+ cls,
64
+ num_cpus,
65
+ num_gpus,
66
+ memory,
67
+ object_store_memory,
68
+ resources,
69
+ redis_max_memory,
70
+ )
71
+
72
+ def resolved(self):
73
+ """Returns if this ResourceSpec has default values filled out."""
74
+ for v in self._asdict().values():
75
+ if v is None:
76
+ return False
77
+ return True
78
+
79
+ def to_resource_dict(self):
80
+ """Returns a dict suitable to pass to raylet initialization.
81
+
82
+ This renames num_cpus / num_gpus to "CPU" / "GPU",
83
+ translates memory from bytes into 100MB memory units, and checks types.
84
+ """
85
+ assert self.resolved()
86
+
87
+ resources = dict(
88
+ self.resources,
89
+ CPU=self.num_cpus,
90
+ GPU=self.num_gpus,
91
+ memory=int(self.memory),
92
+ object_store_memory=int(self.object_store_memory),
93
+ )
94
+
95
+ resources = {
96
+ resource_label: resource_quantity
97
+ for resource_label, resource_quantity in resources.items()
98
+ if resource_quantity != 0
99
+ }
100
+
101
+ # Check types.
102
+ for resource_label, resource_quantity in resources.items():
103
+ assert isinstance(resource_quantity, int) or isinstance(
104
+ resource_quantity, float
105
+ ), (
106
+ f"{resource_label} ({type(resource_quantity)}): " f"{resource_quantity}"
107
+ )
108
+ if (
109
+ isinstance(resource_quantity, float)
110
+ and not resource_quantity.is_integer()
111
+ ):
112
+ raise ValueError(
113
+ "Resource quantities must all be whole numbers. "
114
+ "Violated by resource '{}' in {}.".format(resource_label, resources)
115
+ )
116
+ if resource_quantity < 0:
117
+ raise ValueError(
118
+ "Resource quantities must be nonnegative. "
119
+ "Violated by resource '{}' in {}.".format(resource_label, resources)
120
+ )
121
+ if resource_quantity > ray_constants.MAX_RESOURCE_QUANTITY:
122
+ raise ValueError(
123
+ "Resource quantities must be at most {}. "
124
+ "Violated by resource '{}' in {}.".format(
125
+ ray_constants.MAX_RESOURCE_QUANTITY, resource_label, resources
126
+ )
127
+ )
128
+
129
+ return resources
130
+
131
+ def resolve(self, is_head: bool, node_ip_address: Optional[str] = None):
132
+ """Returns a copy with values filled out with system defaults.
133
+
134
+ Args:
135
+ is_head: Whether this is the head node.
136
+ node_ip_address: The IP address of the node that we are on.
137
+ This is used to automatically create a node id resource.
138
+ """
139
+
140
+ resources = (self.resources or {}).copy()
141
+ assert "CPU" not in resources, resources
142
+ assert "GPU" not in resources, resources
143
+ assert "memory" not in resources, resources
144
+ assert "object_store_memory" not in resources, resources
145
+
146
+ if node_ip_address is None:
147
+ node_ip_address = ray.util.get_node_ip_address()
148
+
149
+ # Automatically create a node id resource on each node. This is
150
+ # queryable with ray._private.state.node_ids() and
151
+ # ray._private.state.current_node_id().
152
+ resources[NODE_ID_PREFIX + node_ip_address] = 1.0
153
+
154
+ # Automatically create a head node resource.
155
+ if HEAD_NODE_RESOURCE_NAME in resources:
156
+ raise ValueError(
157
+ f"{HEAD_NODE_RESOURCE_NAME}"
158
+ " is a reserved resource name, use another name instead."
159
+ )
160
+ if is_head:
161
+ resources[HEAD_NODE_RESOURCE_NAME] = 1.0
162
+
163
+ num_cpus = self.num_cpus
164
+ if num_cpus is None:
165
+ num_cpus = ray._private.utils.get_num_cpus()
166
+
167
+ num_gpus = 0
168
+ for (
169
+ accelerator_resource_name
170
+ ) in ray._private.accelerators.get_all_accelerator_resource_names():
171
+ accelerator_manager = (
172
+ ray._private.accelerators.get_accelerator_manager_for_resource(
173
+ accelerator_resource_name
174
+ )
175
+ )
176
+ num_accelerators = None
177
+ if accelerator_resource_name == "GPU":
178
+ num_accelerators = self.num_gpus
179
+ else:
180
+ num_accelerators = resources.get(accelerator_resource_name, None)
181
+ visible_accelerator_ids = (
182
+ accelerator_manager.get_current_process_visible_accelerator_ids()
183
+ )
184
+ # Check that the number of accelerators that the raylet wants doesn't
185
+ # exceed the amount allowed by visible accelerator ids.
186
+ if (
187
+ num_accelerators is not None
188
+ and visible_accelerator_ids is not None
189
+ and num_accelerators > len(visible_accelerator_ids)
190
+ ):
191
+ raise ValueError(
192
+ f"Attempting to start raylet with {num_accelerators} "
193
+ f"{accelerator_resource_name}, "
194
+ f"but {accelerator_manager.get_visible_accelerator_ids_env_var()} "
195
+ f"contains {visible_accelerator_ids}."
196
+ )
197
+ if num_accelerators is None:
198
+ # Try to automatically detect the number of accelerators.
199
+ num_accelerators = (
200
+ accelerator_manager.get_current_node_num_accelerators()
201
+ )
202
+ # Don't use more accelerators than allowed by visible accelerator ids.
203
+ if visible_accelerator_ids is not None:
204
+ num_accelerators = min(
205
+ num_accelerators, len(visible_accelerator_ids)
206
+ )
207
+
208
+ if num_accelerators:
209
+ if accelerator_resource_name == "GPU":
210
+ num_gpus = num_accelerators
211
+ else:
212
+ resources[accelerator_resource_name] = num_accelerators
213
+
214
+ accelerator_type = (
215
+ accelerator_manager.get_current_node_accelerator_type()
216
+ )
217
+ if accelerator_type:
218
+ resources[
219
+ f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}"
220
+ ] = 1
221
+
222
+ from ray._private.usage import usage_lib
223
+
224
+ usage_lib.record_hardware_usage(accelerator_type)
225
+ additional_resources = (
226
+ accelerator_manager.get_current_node_additional_resources()
227
+ )
228
+ if additional_resources:
229
+ resources.update(additional_resources)
230
+ # Choose a default object store size.
231
+ system_memory = ray._private.utils.get_system_memory()
232
+ avail_memory = ray._private.utils.estimate_available_memory()
233
+ object_store_memory = self.object_store_memory
234
+ if object_store_memory is None:
235
+ object_store_memory = int(
236
+ avail_memory * ray_constants.DEFAULT_OBJECT_STORE_MEMORY_PROPORTION
237
+ )
238
+
239
+ # Set the object_store_memory size to 2GB on Mac
240
+ # to avoid degraded performance.
241
+ # (https://github.com/ray-project/ray/issues/20388)
242
+ if sys.platform == "darwin":
243
+ object_store_memory = min(
244
+ object_store_memory, ray_constants.MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT
245
+ )
246
+
247
+ object_store_memory_cap = (
248
+ ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES
249
+ )
250
+
251
+ # Cap by shm size by default to avoid low performance, but don't
252
+ # go lower than REQUIRE_SHM_SIZE_THRESHOLD.
253
+ if sys.platform == "linux" or sys.platform == "linux2":
254
+ # Multiply by 0.95 to give a bit of wiggle-room.
255
+ # https://github.com/ray-project/ray/pull/23034/files
256
+ shm_avail = ray._private.utils.get_shared_memory_bytes() * 0.95
257
+ shm_cap = max(ray_constants.REQUIRE_SHM_SIZE_THRESHOLD, shm_avail)
258
+
259
+ object_store_memory_cap = min(object_store_memory_cap, shm_cap)
260
+
261
+ # Cap memory to avoid memory waste and perf issues on large nodes
262
+ if (
263
+ object_store_memory_cap
264
+ and object_store_memory > object_store_memory_cap
265
+ ):
266
+ logger.debug(
267
+ "Warning: Capping object memory store to {}GB. ".format(
268
+ object_store_memory_cap // 1e9
269
+ )
270
+ + "To increase this further, specify `object_store_memory` "
271
+ "when calling ray.init() or ray start."
272
+ )
273
+ object_store_memory = object_store_memory_cap
274
+
275
+ redis_max_memory = self.redis_max_memory
276
+ if redis_max_memory is None:
277
+ redis_max_memory = min(
278
+ ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
279
+ max(int(avail_memory * 0.1), ray_constants.REDIS_MINIMUM_MEMORY_BYTES),
280
+ )
281
+ if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES:
282
+ raise ValueError(
283
+ "Attempting to cap Redis memory usage at {} bytes, "
284
+ "but the minimum allowed is {} bytes.".format(
285
+ redis_max_memory, ray_constants.REDIS_MINIMUM_MEMORY_BYTES
286
+ )
287
+ )
288
+
289
+ memory = self.memory
290
+ if memory is None:
291
+ memory = (
292
+ avail_memory
293
+ - object_store_memory
294
+ - (redis_max_memory if is_head else 0)
295
+ )
296
+ if memory < 100e6 and memory < 0.05 * system_memory:
297
+ raise ValueError(
298
+ "After taking into account object store and redis memory "
299
+ "usage, the amount of memory on this node available for "
300
+ "tasks and actors ({} GB) is less than {}% of total. "
301
+ "You can adjust these settings with "
302
+ "ray.init(memory=<bytes>, "
303
+ "object_store_memory=<bytes>).".format(
304
+ round(memory / 1e9, 2), int(100 * (memory / system_memory))
305
+ )
306
+ )
307
+
308
+ spec = ResourceSpec(
309
+ num_cpus,
310
+ num_gpus,
311
+ memory,
312
+ object_store_memory,
313
+ resources,
314
+ redis_max_memory,
315
+ )
316
+ assert spec.resolved()
317
+ return spec
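
An illustrative sketch (not part of the commit above) of how to_resource_dict() renames and filters fields; the numbers are arbitrary and it assumes the module is importable as ray._private.resource_spec:

from ray._private.resource_spec import ResourceSpec

spec = ResourceSpec(
    num_cpus=4,
    num_gpus=0,
    memory=2 * 10**9,
    object_store_memory=10**9,
    resources={"custom": 1},
    redis_max_memory=10**8,
)
# GPU is dropped because zero-valued resources are filtered out.
print(spec.to_resource_dict())
# {'custom': 1, 'CPU': 4, 'memory': 2000000000, 'object_store_memory': 1000000000}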
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # List of files to exclude from the Ray directory when using runtime_env for
2
+ # Ray development. These are not necessary in the Ray workers.
3
+ RAY_WORKER_DEV_EXCLUDES = ["raylet", "gcs_server", "cpp/", "tests/", "core/src"]
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/_clonevirtualenv.py ADDED
@@ -0,0 +1,334 @@
1
+ #!/usr/bin/env python
2
+
3
+ from __future__ import with_statement
4
+
5
+ import logging
6
+ import optparse
7
+ import os
8
+ import os.path
9
+ import re
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ import itertools
14
+
15
+ __version__ = "0.5.7"
16
+
17
+
18
+ logger = logging.getLogger()
19
+
20
+
21
+ env_bin_dir = "bin"
22
+ if sys.platform == "win32":
23
+ env_bin_dir = "Scripts"
24
+ _WIN32 = True
25
+ else:
26
+ _WIN32 = False
27
+
28
+
29
+ class UserError(Exception):
30
+ pass
31
+
32
+
33
+ def _dirmatch(path, matchwith):
34
+ """Check if path is within matchwith's tree.
35
+ >>> _dirmatch('/home/foo/bar', '/home/foo/bar')
36
+ True
37
+ >>> _dirmatch('/home/foo/bar/', '/home/foo/bar')
38
+ True
39
+ >>> _dirmatch('/home/foo/bar/etc', '/home/foo/bar')
40
+ True
41
+ >>> _dirmatch('/home/foo/bar2', '/home/foo/bar')
42
+ False
43
+ >>> _dirmatch('/home/foo/bar2/etc', '/home/foo/bar')
44
+ False
45
+ """
46
+ matchlen = len(matchwith)
47
+ if path.startswith(matchwith) and path[matchlen : matchlen + 1] in [os.sep, ""]:
48
+ return True
49
+ return False
50
+
51
+
52
+ def _virtualenv_sys(venv_path):
53
+ """obtain version and path info from a virtualenv."""
54
+ executable = os.path.join(venv_path, env_bin_dir, "python")
55
+ if _WIN32:
56
+ env = os.environ.copy()
57
+ else:
58
+ env = {}
59
+ # Must use "executable" as the first argument rather than as the
60
+ # keyword argument "executable" to get correct value from sys.path
61
+ p = subprocess.Popen(
62
+ [
63
+ executable,
64
+ "-c",
65
+ "import sys;"
66
+ 'print ("%d.%d" % (sys.version_info.major, sys.version_info.minor));'
67
+ 'print ("\\n".join(sys.path));',
68
+ ],
69
+ env=env,
70
+ stdout=subprocess.PIPE,
71
+ )
72
+ stdout, err = p.communicate()
73
+ assert not p.returncode and stdout
74
+ lines = stdout.decode("utf-8").splitlines()
75
+ return lines[0], list(filter(bool, lines[1:]))
76
+
77
+
78
+ def clone_virtualenv(src_dir, dst_dir):
79
+ if not os.path.exists(src_dir):
80
+ raise UserError("src dir %r does not exist" % src_dir)
81
+ if os.path.exists(dst_dir):
82
+ raise UserError("dest dir %r exists" % dst_dir)
83
+ # sys_path = _virtualenv_syspath(src_dir)
84
+ logger.info("cloning virtualenv '%s' => '%s'..." % (src_dir, dst_dir))
85
+ shutil.copytree(
86
+ src_dir, dst_dir, symlinks=True, ignore=shutil.ignore_patterns("*.pyc")
87
+ )
88
+ version, sys_path = _virtualenv_sys(dst_dir)
89
+ logger.info("fixing scripts in bin...")
90
+ fixup_scripts(src_dir, dst_dir, version)
91
+
92
+ has_old = lambda s: any(i for i in s if _dirmatch(i, src_dir)) # noqa: E731
93
+
94
+ if has_old(sys_path):
95
+ # only need to fix stuff in sys.path if we have old
96
+ # paths in the sys.path of new python env. right?
97
+ logger.info("fixing paths in sys.path...")
98
+ fixup_syspath_items(sys_path, src_dir, dst_dir)
99
+ v_sys = _virtualenv_sys(dst_dir)
100
+ remaining = has_old(v_sys[1])
101
+ assert not remaining, v_sys
102
+ fix_symlink_if_necessary(src_dir, dst_dir)
103
+
104
+
105
+ def fix_symlink_if_necessary(src_dir, dst_dir):
106
+ # sometimes the source virtual environment has symlinks that point to itself
107
+ # one example is $OLD_VIRTUAL_ENV/local/lib points to $OLD_VIRTUAL_ENV/lib
108
+ # this function makes sure
109
+ # $NEW_VIRTUAL_ENV/local/lib will point to $NEW_VIRTUAL_ENV/lib
110
+ # usually this goes unnoticed unless one tries to upgrade a package though pip,
111
+ # so this bug is hard to find.
112
+ logger.info("scanning for internal symlinks that point to the original virtual env")
113
+ for dirpath, dirnames, filenames in os.walk(dst_dir):
114
+ for a_file in itertools.chain(filenames, dirnames):
115
+ full_file_path = os.path.join(dirpath, a_file)
116
+ if os.path.islink(full_file_path):
117
+ target = os.path.realpath(full_file_path)
118
+ if target.startswith(src_dir):
119
+ new_target = target.replace(src_dir, dst_dir)
120
+ logger.debug("fixing symlink in %s" % (full_file_path,))
121
+ os.remove(full_file_path)
122
+ os.symlink(new_target, full_file_path)
123
+
124
+
125
+ def fixup_scripts(old_dir, new_dir, version, rewrite_env_python=False):
126
+ bin_dir = os.path.join(new_dir, env_bin_dir)
127
+ root, dirs, files = next(os.walk(bin_dir))
128
+ pybinre = re.compile(r"pythonw?([0-9]+(\.[0-9]+(\.[0-9]+)?)?)?$")
129
+ for file_ in files:
130
+ filename = os.path.join(root, file_)
131
+ if file_ in ["python", "python%s" % version, "activate_this.py"]:
132
+ continue
133
+ elif file_.startswith("python") and pybinre.match(file_):
134
+ # ignore other possible python binaries
135
+ continue
136
+ elif file_.endswith(".pyc"):
137
+ # ignore compiled files
138
+ continue
139
+ elif file_ == "activate" or file_.startswith("activate."):
140
+ fixup_activate(os.path.join(root, file_), old_dir, new_dir)
141
+ elif os.path.islink(filename):
142
+ fixup_link(filename, old_dir, new_dir)
143
+ elif os.path.isfile(filename):
144
+ fixup_script_(
145
+ root,
146
+ file_,
147
+ old_dir,
148
+ new_dir,
149
+ version,
150
+ rewrite_env_python=rewrite_env_python,
151
+ )
152
+
153
+
154
+ def fixup_script_(root, file_, old_dir, new_dir, version, rewrite_env_python=False):
155
+ old_shebang = "#!%s/bin/python" % os.path.normcase(os.path.abspath(old_dir))
156
+ new_shebang = "#!%s/bin/python" % os.path.normcase(os.path.abspath(new_dir))
157
+ env_shebang = "#!/usr/bin/env python"
158
+
159
+ filename = os.path.join(root, file_)
160
+ with open(filename, "rb") as f:
161
+ if f.read(2) != b"#!":
162
+ # no shebang
163
+ return
164
+ f.seek(0)
165
+ lines = f.readlines()
166
+
167
+ if not lines:
168
+ # warn: empty script
169
+ return
170
+
171
+ def rewrite_shebang(version=None):
172
+ logger.debug("fixing %s" % filename)
173
+ shebang = new_shebang
174
+ if version:
175
+ shebang = shebang + version
176
+ shebang = (shebang + "\n").encode("utf-8")
177
+ with open(filename, "wb") as f:
178
+ f.write(shebang)
179
+ f.writelines(lines[1:])
180
+
181
+ try:
182
+ bang = lines[0].decode("utf-8").strip()
183
+ except UnicodeDecodeError:
184
+ # binary file
185
+ return
186
+
187
+ # This takes care of the scheme in which shebang is of type
188
+ # '#!/venv/bin/python3' while the version of system python
189
+ # is of type 3.x e.g. 3.5.
190
+ short_version = bang[len(old_shebang) :]
191
+
192
+ if not bang.startswith("#!"):
193
+ return
194
+ elif bang == old_shebang:
195
+ rewrite_shebang()
196
+ elif bang.startswith(old_shebang) and bang[len(old_shebang) :] == version:
197
+ rewrite_shebang(version)
198
+ elif (
199
+ bang.startswith(old_shebang)
200
+ and short_version
201
+ and bang[len(old_shebang) :] == short_version
202
+ ):
203
+ rewrite_shebang(short_version)
204
+ elif rewrite_env_python and bang.startswith(env_shebang):
205
+ if bang == env_shebang:
206
+ rewrite_shebang()
207
+ elif bang[len(env_shebang) :] == version:
208
+ rewrite_shebang(version)
209
+ else:
210
+ # can't do anything
211
+ return
212
+
213
+
214
+ def fixup_activate(filename, old_dir, new_dir):
215
+ logger.debug("fixing %s" % filename)
216
+ with open(filename, "rb") as f:
217
+ data = f.read().decode("utf-8")
218
+
219
+ data = data.replace(old_dir, new_dir)
220
+ with open(filename, "wb") as f:
221
+ f.write(data.encode("utf-8"))
222
+
223
+
224
+ def fixup_link(filename, old_dir, new_dir, target=None):
225
+ logger.debug("fixing %s" % filename)
226
+ if target is None:
227
+ target = os.readlink(filename)
228
+
229
+ origdir = os.path.dirname(os.path.abspath(filename)).replace(new_dir, old_dir)
230
+ if not os.path.isabs(target):
231
+ target = os.path.abspath(os.path.join(origdir, target))
232
+ rellink = True
233
+ else:
234
+ rellink = False
235
+
236
+ if _dirmatch(target, old_dir):
237
+ if rellink:
238
+ # keep relative links, but don't keep original in case it
239
+ # traversed up out of, then back into the venv.
240
+ # so, recreate a relative link from absolute.
241
+ target = target[len(origdir) :].lstrip(os.sep)
242
+ else:
243
+ target = target.replace(old_dir, new_dir, 1)
244
+
245
+ # else: links outside the venv, replaced with absolute path to target.
246
+ _replace_symlink(filename, target)
247
+
248
+
249
+ def _replace_symlink(filename, newtarget):
250
+ tmpfn = "%s.new" % filename
251
+ os.symlink(newtarget, tmpfn)
252
+ os.rename(tmpfn, filename)
253
+
254
+
255
+ def fixup_syspath_items(syspath, old_dir, new_dir):
256
+ for path in syspath:
257
+ if not os.path.isdir(path):
258
+ continue
259
+ path = os.path.normcase(os.path.abspath(path))
260
+ if _dirmatch(path, old_dir):
261
+ path = path.replace(old_dir, new_dir, 1)
262
+ if not os.path.exists(path):
263
+ continue
264
+ elif not _dirmatch(path, new_dir):
265
+ continue
266
+ root, dirs, files = next(os.walk(path))
267
+ for file_ in files:
268
+ filename = os.path.join(root, file_)
269
+ if filename.endswith(".pth"):
270
+ fixup_pth_file(filename, old_dir, new_dir)
271
+ elif filename.endswith(".egg-link"):
272
+ fixup_egglink_file(filename, old_dir, new_dir)
273
+
274
+
275
+ def fixup_pth_file(filename, old_dir, new_dir):
276
+ logger.debug("fixup_pth_file %s" % filename)
277
+
278
+ with open(filename, "r") as f:
279
+ lines = f.readlines()
280
+
281
+ has_change = False
282
+
283
+ for num, line in enumerate(lines):
284
+ line = (line.decode("utf-8") if hasattr(line, "decode") else line).strip()
285
+
286
+ if not line or line.startswith("#") or line.startswith("import "):
287
+ continue
288
+ elif _dirmatch(line, old_dir):
289
+ lines[num] = line.replace(old_dir, new_dir, 1)
290
+ has_change = True
291
+
292
+ if has_change:
293
+ with open(filename, "w") as f:
294
+ payload = os.linesep.join([line.strip() for line in lines]) + os.linesep
295
+ f.write(payload)
296
+
297
+
298
+ def fixup_egglink_file(filename, old_dir, new_dir):
299
+ logger.debug("fixing %s" % filename)
300
+ with open(filename, "rb") as f:
301
+ link = f.read().decode("utf-8").strip()
302
+ if _dirmatch(link, old_dir):
303
+ link = link.replace(old_dir, new_dir, 1)
304
+ with open(filename, "wb") as f:
305
+ link = (link + "\n").encode("utf-8")
306
+ f.write(link)
307
+
308
+
309
+ def main():
310
+ parser = optparse.OptionParser(
311
+ "usage: %prog [options] /path/to/existing/venv /path/to/cloned/venv"
312
+ )
313
+ parser.add_option(
314
+ "-v", action="count", dest="verbose", default=False, help="verbosity"
315
+ )
316
+ options, args = parser.parse_args()
317
+ try:
318
+ old_dir, new_dir = args
319
+ except ValueError:
320
+ print("virtualenv-clone %s" % (__version__,))
321
+ parser.error("not enough arguments given.")
322
+ old_dir = os.path.realpath(old_dir)
323
+ new_dir = os.path.realpath(new_dir)
324
+ loglevel = (logging.WARNING, logging.INFO, logging.DEBUG)[min(2, options.verbose)]
325
+ logging.basicConfig(level=loglevel, format="%(message)s")
326
+ try:
327
+ clone_virtualenv(old_dir, new_dir)
328
+ except UserError:
329
+ e = sys.exc_info()[1]
330
+ parser.error(str(e))
331
+
332
+
333
+ if __name__ == "__main__":
334
+ main()
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/conda.py ADDED
@@ -0,0 +1,407 @@
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import platform
6
+ import runpy
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ import yaml
14
+ from filelock import FileLock
15
+
16
+ import ray
17
+ from ray._private.runtime_env.conda_utils import (
18
+ create_conda_env_if_needed,
19
+ delete_conda_env,
20
+ get_conda_activate_commands,
21
+ get_conda_info_json,
22
+ get_conda_envs,
23
+ )
24
+ from ray._private.runtime_env.context import RuntimeEnvContext
25
+ from ray._private.runtime_env.packaging import Protocol, parse_uri
26
+ from ray._private.runtime_env.plugin import RuntimeEnvPlugin
27
+ from ray._private.runtime_env.validation import parse_and_validate_conda
28
+ from ray._private.utils import (
29
+ get_directory_size_bytes,
30
+ get_master_wheel_url,
31
+ get_or_create_event_loop,
32
+ get_release_wheel_url,
33
+ get_wheel_filename,
34
+ try_to_create_directory,
35
+ )
36
+
37
+ default_logger = logging.getLogger(__name__)
38
+
39
+ _WIN32 = os.name == "nt"
40
+
41
+
42
+ def _resolve_current_ray_path() -> str:
43
+ # When ray is built from source with pip install -e,
44
+ # ray.__file__ returns .../python/ray/__init__.py and this function returns
45
+ # ".../python".
46
+ # When ray is installed from a prebuilt binary, ray.__file__ returns
47
+ # .../site-packages/ray/__init__.py and this function returns
48
+ # ".../site-packages".
49
+ return os.path.split(os.path.split(ray.__file__)[0])[0]
50
+
51
+
52
+ def _get_ray_setup_spec():
53
+ """Find the Ray setup_spec from the currently running Ray.
54
+
55
+ This function works even when Ray is built from source with pip install -e.
56
+ """
57
+ ray_source_python_path = _resolve_current_ray_path()
58
+ setup_py_path = os.path.join(ray_source_python_path, "setup.py")
59
+ return runpy.run_path(setup_py_path)["setup_spec"]
60
+
61
+
62
+ def _resolve_install_from_source_ray_dependencies():
63
+ """Find the Ray dependencies when Ray is installed from source."""
64
+ deps = (
65
+ _get_ray_setup_spec().install_requires + _get_ray_setup_spec().extras["default"]
66
+ )
67
+ # Remove duplicates
68
+ return list(set(deps))
69
+
70
+
71
+ def _inject_ray_to_conda_site(
72
+ conda_path, logger: Optional[logging.Logger] = default_logger
73
+ ):
74
+ """Write the current Ray site package directory to a new site"""
75
+ if _WIN32:
76
+ python_binary = os.path.join(conda_path, "python")
77
+ else:
78
+ python_binary = os.path.join(conda_path, "bin/python")
79
+ site_packages_path = (
80
+ subprocess.check_output(
81
+ [
82
+ python_binary,
83
+ "-c",
84
+ "import sysconfig; print(sysconfig.get_paths()['purelib'])",
85
+ ]
86
+ )
87
+ .decode()
88
+ .strip()
89
+ )
90
+
91
+ ray_path = _resolve_current_ray_path()
92
+ logger.warning(
93
+ f"Injecting {ray_path} to environment site-packages {site_packages_path} "
94
+ "because _inject_current_ray flag is on."
95
+ )
96
+
97
+ maybe_ray_dir = os.path.join(site_packages_path, "ray")
98
+ if os.path.isdir(maybe_ray_dir):
99
+ logger.warning(f"Replacing existing ray installation with {ray_path}")
100
+ shutil.rmtree(maybe_ray_dir)
101
+
102
+ # See usage of *.pth file at
103
+ # https://docs.python.org/3/library/site.html
104
+ with open(os.path.join(site_packages_path, "ray_shared.pth"), "w") as f:
105
+ f.write(ray_path)
106
+
107
+
108
+ def _current_py_version():
109
+ return ".".join(map(str, sys.version_info[:3])) # like 3.6.10
110
+
111
+
112
+ def _is_m1_mac():
113
+ return sys.platform == "darwin" and platform.machine() == "arm64"
114
+
115
+
116
+ def current_ray_pip_specifier(
117
+ logger: Optional[logging.Logger] = default_logger,
118
+ ) -> Optional[str]:
119
+ """The pip requirement specifier for the running version of Ray.
120
+
121
+ Returns:
122
+ A string which can be passed to `pip install` to install the
123
+ currently running Ray version, or None if running on a version
124
+ built from source locally (likely if you are developing Ray).
125
+
126
+ Examples:
127
+ Returns "https://s3-us-west-2.amazonaws.com/ray-wheels/[..].whl"
128
+ if running a stable release, a nightly or a specific commit
129
+ """
130
+ if os.environ.get("RAY_CI_POST_WHEEL_TESTS"):
131
+ # Running in Buildkite CI after the wheel has been built.
132
+ # Wheels are at in the ray/.whl directory, but use relative path to
133
+ # allow for testing locally if needed.
134
+ return os.path.join(
135
+ Path(ray.__file__).resolve().parents[2], ".whl", get_wheel_filename()
136
+ )
137
+ elif ray.__commit__ == "{{RAY_COMMIT_SHA}}":
138
+ # Running on a version built from source locally.
139
+ if os.environ.get("RAY_RUNTIME_ENV_LOCAL_DEV_MODE") != "1":
140
+ logger.warning(
141
+ "Current Ray version could not be detected, most likely "
142
+ "because you have manually built Ray from source. To use "
143
+ "runtime_env in this case, set the environment variable "
144
+ "RAY_RUNTIME_ENV_LOCAL_DEV_MODE=1."
145
+ )
146
+ return None
147
+ elif "dev" in ray.__version__:
148
+ # Running on a nightly wheel.
149
+ if _is_m1_mac():
150
+ raise ValueError("Nightly wheels are not available for M1 Macs.")
151
+ return get_master_wheel_url()
152
+ else:
153
+ if _is_m1_mac():
154
+ # M1 Mac release wheels are currently not uploaded to AWS S3; they
155
+ # are only available on PyPI. So unfortunately, this codepath is
156
+ # not end-to-end testable prior to the release going live on PyPI.
157
+ return f"ray=={ray.__version__}"
158
+ else:
159
+ return get_release_wheel_url()
160
+
161
+
162
+ def inject_dependencies(
163
+ conda_dict: Dict[Any, Any],
164
+ py_version: str,
165
+ pip_dependencies: Optional[List[str]] = None,
166
+ ) -> Dict[Any, Any]:
167
+ """Add Ray, Python and (optionally) extra pip dependencies to a conda dict.
168
+
169
+ Args:
170
+ conda_dict: A dict representing the JSON-serialized conda
171
+ environment YAML file. This dict will be modified and returned.
172
+ py_version: A string representing a Python version to inject
173
+ into the conda dependencies, e.g. "3.7.7"
174
+ pip_dependencies (List[str]): A list of pip dependencies that
175
+ will be prepended to the list of pip dependencies in
176
+ the conda dict. If the conda dict does not already have a "pip"
177
+ field, one will be created.
178
+ Returns:
179
+ The modified dict. (Note: the input argument conda_dict is modified
180
+ and returned.)
181
+ """
182
+ if pip_dependencies is None:
183
+ pip_dependencies = []
184
+ if conda_dict.get("dependencies") is None:
185
+ conda_dict["dependencies"] = []
186
+
187
+ # Inject Python dependency.
188
+ deps = conda_dict["dependencies"]
189
+
190
+ # Add current python dependency. If the user has already included a
191
+ # python version dependency, conda will raise a readable error if the two
192
+ # are incompatible, e.g:
193
+ # ResolvePackageNotFound: - python[version='3.5.*,>=3.6']
194
+ deps.append(f"python={py_version}")
195
+
196
+ if "pip" not in deps:
197
+ deps.append("pip")
198
+
199
+ # Insert pip dependencies.
200
+ found_pip_dict = False
201
+ for dep in deps:
202
+ if isinstance(dep, dict) and dep.get("pip") and isinstance(dep["pip"], list):
203
+ dep["pip"] = pip_dependencies + dep["pip"]
204
+ found_pip_dict = True
205
+ break
206
+ if not found_pip_dict:
207
+ deps.append({"pip": pip_dependencies})
208
+
209
+ return conda_dict
210
+
211
+
212
+ def _get_conda_env_hash(conda_dict: Dict) -> str:
213
+ # Set `sort_keys=True` so that different orderings yield the same hash.
214
+ serialized_conda_spec = json.dumps(conda_dict, sort_keys=True)
215
+ hash = hashlib.sha1(serialized_conda_spec.encode("utf-8")).hexdigest()
216
+ return hash
217
+
218
+
219
+ def get_uri(runtime_env: Dict) -> Optional[str]:
220
+ """Return `"conda://<hashed_dependencies>"`, or None if no GC required."""
221
+ conda = runtime_env.get("conda")
222
+ if conda is not None:
223
+ if isinstance(conda, str):
224
+ # User-preinstalled conda env. We don't garbage collect these, so
225
+ # we don't track them with URIs.
226
+ uri = None
227
+ elif isinstance(conda, dict):
228
+ uri = f"conda://{_get_conda_env_hash(conda_dict=conda)}"
229
+ else:
230
+ raise TypeError(
231
+ "conda field received by RuntimeEnvAgent must be "
232
+ f"str or dict, not {type(conda).__name__}."
233
+ )
234
+ else:
235
+ uri = None
236
+ return uri
237
+
238
+
239
+ def _get_conda_dict_with_ray_inserted(
240
+ runtime_env: "RuntimeEnv", # noqa: F821
241
+ logger: Optional[logging.Logger] = default_logger,
242
+ ) -> Dict[str, Any]:
243
+ """Returns the conda spec with the Ray and `python` dependency inserted."""
244
+ conda_dict = json.loads(runtime_env.conda_config())
245
+ assert conda_dict is not None
246
+
247
+ ray_pip = current_ray_pip_specifier(logger=logger)
248
+ if ray_pip:
249
+ extra_pip_dependencies = [ray_pip, "ray[default]"]
250
+ elif runtime_env.get_extension("_inject_current_ray"):
251
+ extra_pip_dependencies = _resolve_install_from_source_ray_dependencies()
252
+ else:
253
+ extra_pip_dependencies = []
254
+ conda_dict = inject_dependencies(
255
+ conda_dict, _current_py_version(), extra_pip_dependencies
256
+ )
257
+ return conda_dict
258
+
259
+
260
+ class CondaPlugin(RuntimeEnvPlugin):
261
+
262
+ name = "conda"
263
+
264
+ def __init__(self, resources_dir: str):
265
+ self._resources_dir = os.path.join(resources_dir, "conda")
266
+ try_to_create_directory(self._resources_dir)
267
+
268
+ # It is not safe for multiple processes to install conda envs
269
+ # concurrently, even if the envs are different, so use a global
270
+ # lock for all conda installs and deletions.
271
+ # See https://github.com/ray-project/ray/issues/17086
272
+ self._installs_and_deletions_file_lock = os.path.join(
273
+ self._resources_dir, "ray-conda-installs-and-deletions.lock"
274
+ )
275
+ # A set of named conda environments (instead of yaml or dict)
276
+ # that are validated to exist.
277
+ # NOTE: It has to be only used within the same thread, which
278
+ # is an event loop.
279
+ # Also, we don't need to GC this field because it is pretty small.
280
+ self._validated_named_conda_env = set()
281
+
282
+ def _get_path_from_hash(self, hash: str) -> str:
283
+ """Generate a path from the hash of a conda or pip spec.
284
+
285
+ The output path also functions as the name of the conda environment
286
+ when using the `--prefix` option to `conda create` and `conda remove`.
287
+
288
+ Example output:
289
+ /tmp/ray/session_2021-11-03_16-33-59_356303_41018/runtime_resources
290
+ /conda/ray-9a7972c3a75f55e976e620484f58410c920db091
291
+ """
292
+ return os.path.join(self._resources_dir, hash)
293
+
294
+ def get_uris(self, runtime_env: "RuntimeEnv") -> List[str]: # noqa: F821
295
+ """Return the conda URI from the RuntimeEnv if it exists, else return []."""
296
+ conda_uri = runtime_env.conda_uri()
297
+ if conda_uri:
298
+ return [conda_uri]
299
+ return []
300
+
301
+ def delete_uri(
302
+ self, uri: str, logger: Optional[logging.Logger] = default_logger
303
+ ) -> int:
304
+ """Delete URI and return the number of bytes deleted."""
305
+ logger.info(f"Got request to delete URI {uri}")
306
+ protocol, hash = parse_uri(uri)
307
+ if protocol != Protocol.CONDA:
308
+ raise ValueError(
309
+ "CondaPlugin can only delete URIs with protocol "
310
+ f"conda. Received protocol {protocol}, URI {uri}"
311
+ )
312
+
313
+ conda_env_path = self._get_path_from_hash(hash)
314
+ local_dir_size = get_directory_size_bytes(conda_env_path)
315
+
316
+ with FileLock(self._installs_and_deletions_file_lock):
317
+ successful = delete_conda_env(prefix=conda_env_path, logger=logger)
318
+ if not successful:
319
+ logger.warning(f"Error when deleting conda env {conda_env_path}. ")
320
+ return 0
321
+
322
+ return local_dir_size
323
+
324
+ async def create(
325
+ self,
326
+ uri: Optional[str],
327
+ runtime_env: "RuntimeEnv", # noqa: F821
328
+ context: RuntimeEnvContext,
329
+ logger: logging.Logger = default_logger,
330
+ ) -> int:
331
+ if not runtime_env.has_conda():
332
+ return 0
333
+
334
+ def _create():
335
+ result = parse_and_validate_conda(runtime_env.get("conda"))
336
+
337
+ if isinstance(result, str):
338
+ # The conda env name is given.
339
+ # In this case, we only verify if the given
340
+ # conda env exists.
341
+
342
+ # If the env is already validated, do nothing.
343
+ if result in self._validated_named_conda_env:
344
+ return 0
345
+
346
+ conda_info = get_conda_info_json()
347
+ envs = get_conda_envs(conda_info)
348
+
349
+ # We accept `result` as a conda name or full path.
350
+ if not any(result == env[0] or result == env[1] for env in envs):
351
+ raise ValueError(
352
+ f"The given conda environment '{result}' "
353
+ f"from the runtime env {runtime_env} doesn't "
354
+ "exist from the output of `conda info --json`. "
355
+ "You can only specify an env that already exists. "
356
+ f"Please make sure to create an env {result} "
357
+ )
358
+ self._validated_named_conda_env.add(result)
359
+ return 0
360
+
361
+ logger.debug(
362
+ "Setting up conda for runtime_env: " f"{runtime_env.serialize()}"
363
+ )
364
+ protocol, hash = parse_uri(uri)
365
+ conda_env_name = self._get_path_from_hash(hash)
366
+
367
+ conda_dict = _get_conda_dict_with_ray_inserted(runtime_env, logger=logger)
368
+
369
+ logger.info(f"Setting up conda environment with {runtime_env}")
370
+ with FileLock(self._installs_and_deletions_file_lock):
371
+ try:
372
+ conda_yaml_file = os.path.join(
373
+ self._resources_dir, "environment.yml"
374
+ )
375
+ with open(conda_yaml_file, "w") as file:
376
+ yaml.dump(conda_dict, file)
377
+ create_conda_env_if_needed(
378
+ conda_yaml_file, prefix=conda_env_name, logger=logger
379
+ )
380
+ finally:
381
+ os.remove(conda_yaml_file)
382
+
383
+ if runtime_env.get_extension("_inject_current_ray"):
384
+ _inject_ray_to_conda_site(conda_path=conda_env_name, logger=logger)
385
+ logger.info(f"Finished creating conda environment at {conda_env_name}")
386
+ return get_directory_size_bytes(conda_env_name)
387
+
388
+ loop = get_or_create_event_loop()
389
+ return await loop.run_in_executor(None, _create)
390
+
391
+ def modify_context(
392
+ self,
393
+ uris: List[str],
394
+ runtime_env: "RuntimeEnv", # noqa: F821
395
+ context: RuntimeEnvContext,
396
+ logger: Optional[logging.Logger] = default_logger,
397
+ ):
398
+ if not runtime_env.has_conda():
399
+ return
400
+
401
+ if runtime_env.conda_env_name():
402
+ conda_env_name = runtime_env.conda_env_name()
403
+ else:
404
+ protocol, hash = parse_uri(runtime_env.conda_uri())
405
+ conda_env_name = self._get_path_from_hash(hash)
406
+ context.py_executable = "python"
407
+ context.command_prefix += get_conda_activate_commands(conda_env_name)
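
An illustrative sketch (not part of the commit above) of what inject_dependencies() does to a conda spec; the package names and versions are arbitrary examples:

from ray._private.runtime_env.conda import inject_dependencies

conda_dict = {"dependencies": ["numpy"]}
out = inject_dependencies(conda_dict, "3.11.4", ["ray==2.9.0"])
# The python pin, "pip", and a pip sub-list are appended in place:
# {'dependencies': ['numpy', 'python=3.11.4', 'pip', {'pip': ['ray==2.9.0']}]}
print(out)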
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/conda_utils.py ADDED
@@ -0,0 +1,278 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ import hashlib
6
+ import json
7
+ from typing import Optional, List, Union, Tuple
8
+
9
+ """Utilities for conda. Adapted from https://github.com/mlflow/mlflow."""
10
+
11
+ # Name of environment variable indicating a path to a conda installation. Ray
12
+ # will default to running "conda" if unset.
13
+ RAY_CONDA_HOME = "RAY_CONDA_HOME"
14
+
15
+ _WIN32 = os.name == "nt"
16
+
17
+
18
+ def get_conda_activate_commands(conda_env_name: str) -> List[str]:
19
+ """
20
+ Get a list of commands to run to silently activate the given conda env.
21
+ """
22
+ # Checking for newer conda versions
23
+ if not _WIN32 and ("CONDA_EXE" in os.environ or RAY_CONDA_HOME in os.environ):
24
+ conda_path = get_conda_bin_executable("conda")
25
+ activate_conda_env = [
26
+ ".",
27
+ f"{os.path.dirname(conda_path)}/../etc/profile.d/conda.sh",
28
+ "&&",
29
+ ]
30
+ activate_conda_env += ["conda", "activate", conda_env_name]
31
+
32
+ else:
33
+ activate_path = get_conda_bin_executable("activate")
34
+ if not _WIN32:
35
+ # Use bash command syntax
36
+ activate_conda_env = ["source", activate_path, conda_env_name]
37
+ else:
38
+ activate_conda_env = ["conda", "activate", conda_env_name]
39
+ return activate_conda_env + ["1>&2", "&&"]
40
+
41
+
42
+ def get_conda_bin_executable(executable_name: str) -> str:
43
+ """
44
+ Return path to the specified executable, assumed to be discoverable within
45
+ a conda installation.
46
+
47
+ The conda home directory (expected to contain a 'bin' subdirectory on
48
+ linux) is configurable via the ``RAY_CONDA_HOME`` environment variable. If
49
+ ``RAY_CONDA_HOME`` is unspecified, try the ``CONDA_EXE`` environment
50
+ variable set by activating conda. If neither is specified, this method
51
+ returns `executable_name`.
52
+ """
53
+ conda_home = os.environ.get(RAY_CONDA_HOME)
54
+ if conda_home:
55
+ if _WIN32:
56
+ candidate = os.path.join(conda_home, "%s.exe" % executable_name)
57
+ if os.path.exists(candidate):
58
+ return candidate
59
+ candidate = os.path.join(conda_home, "%s.bat" % executable_name)
60
+ if os.path.exists(candidate):
61
+ return candidate
62
+ else:
63
+ return os.path.join(conda_home, "bin/%s" % executable_name)
64
+ else:
65
+ conda_home = "."
66
+ # Use CONDA_EXE as per https://github.com/conda/conda/issues/7126
67
+ if "CONDA_EXE" in os.environ:
68
+ conda_bin_dir = os.path.dirname(os.environ["CONDA_EXE"])
69
+ if _WIN32:
70
+ candidate = os.path.join(conda_home, "%s.exe" % executable_name)
71
+ if os.path.exists(candidate):
72
+ return candidate
73
+ candidate = os.path.join(conda_home, "%s.bat" % executable_name)
74
+ if os.path.exists(candidate):
75
+ return candidate
76
+ else:
77
+ return os.path.join(conda_bin_dir, executable_name)
78
+ if _WIN32:
79
+ return executable_name + ".bat"
80
+ return executable_name
81
+
82
+
83
+ def _get_conda_env_name(conda_env_path: str) -> str:
84
+ conda_env_contents = open(conda_env_path).read()
85
+ return "ray-%s" % hashlib.sha1(conda_env_contents.encode("utf-8")).hexdigest()
86
+
87
+
88
+ def create_conda_env_if_needed(
89
+ conda_yaml_file: str, prefix: str, logger: Optional[logging.Logger] = None
90
+ ) -> None:
91
+ """
92
+ Given a conda YAML, creates a conda environment containing the required
93
+ dependencies if such a conda environment doesn't already exist.
94
+ Args:
95
+ conda_yaml_file: The path to a conda `environment.yml` file.
96
+ prefix: Directory to install the environment into via
97
+ the `--prefix` option to conda create. This also becomes the name
98
+ of the conda env; i.e. it can be passed into `conda activate` and
99
+ `conda remove`
100
+ """
101
+ if logger is None:
102
+ logger = logging.getLogger(__name__)
103
+
104
+ conda_path = get_conda_bin_executable("conda")
105
+ try:
106
+ exec_cmd([conda_path, "--help"], throw_on_error=False)
107
+ except (EnvironmentError, FileNotFoundError):
108
+ raise ValueError(
109
+ f"Could not find Conda executable at '{conda_path}'. "
110
+ "Ensure Conda is installed as per the instructions at "
111
+ "https://conda.io/projects/conda/en/latest/"
112
+ "user-guide/install/index.html. "
113
+ "You can also configure Ray to look for a specific "
114
+ f"Conda executable by setting the {RAY_CONDA_HOME} "
115
+ "environment variable to the path of the Conda executable."
116
+ )
117
+
118
+ _, stdout, _ = exec_cmd([conda_path, "env", "list", "--json"])
119
+ envs = json.loads(stdout[stdout.index("{") :])["envs"]
120
+
121
+ if prefix in envs:
122
+ logger.info(f"Conda environment {prefix} already exists.")
123
+ return
124
+
125
+ create_cmd = [
126
+ conda_path,
127
+ "env",
128
+ "create",
129
+ "--file",
130
+ conda_yaml_file,
131
+ "--prefix",
132
+ prefix,
133
+ ]
134
+
135
+ logger.info(f"Creating conda environment {prefix}")
136
+ exit_code, output = exec_cmd_stream_to_logger(create_cmd, logger)
137
+ if exit_code != 0:
138
+ if os.path.exists(prefix):
139
+ shutil.rmtree(prefix)
140
+ raise RuntimeError(
141
+ f"Failed to install conda environment {prefix}:\nOutput:\n{output}"
142
+ )
143
+
144
+
145
+ def delete_conda_env(prefix: str, logger: Optional[logging.Logger] = None) -> bool:
146
+ if logger is None:
147
+ logger = logging.getLogger(__name__)
148
+
149
+ logger.info(f"Deleting conda environment {prefix}")
150
+
151
+ conda_path = get_conda_bin_executable("conda")
152
+ delete_cmd = [conda_path, "remove", "-p", prefix, "--all", "-y"]
153
+ exit_code, output = exec_cmd_stream_to_logger(delete_cmd, logger)
154
+
155
+ if exit_code != 0:
156
+ logger.debug(f"Failed to delete conda environment {prefix}:\n{output}")
157
+ return False
158
+
159
+ return True
160
+
161
+
162
+ def get_conda_env_list() -> list:
163
+ """
164
+ Get conda env list in full paths.
165
+ """
166
+ conda_path = get_conda_bin_executable("conda")
167
+ try:
168
+ exec_cmd([conda_path, "--help"], throw_on_error=False)
169
+ except EnvironmentError:
170
+ raise ValueError(f"Could not find Conda executable at {conda_path}.")
171
+ _, stdout, _ = exec_cmd([conda_path, "env", "list", "--json"])
172
+ envs = json.loads(stdout)["envs"]
173
+ return envs
174
+
175
+
176
+ def get_conda_info_json() -> dict:
177
+ """
178
+ Get `conda info --json` output.
179
+
180
+ Returns dict of conda info. See [1] for more details. We mostly care about these
181
+ keys:
182
+
183
+ - `conda_prefix`: str The path to the conda installation.
184
+ - `envs`: List[str] absolute paths to conda environments.
185
+
186
+ [1] https://github.com/conda/conda/blob/main/conda/cli/main_info.py
187
+ """
188
+ conda_path = get_conda_bin_executable("conda")
189
+ try:
190
+ exec_cmd([conda_path, "--help"], throw_on_error=False)
191
+ except EnvironmentError:
192
+ raise ValueError(f"Could not find Conda executable at {conda_path}.")
193
+ _, stdout, _ = exec_cmd([conda_path, "info", "--json"])
194
+ return json.loads(stdout)
195
+
196
+
197
+ def get_conda_envs(conda_info: dict) -> List[Tuple[str, str]]:
198
+ """
199
+ Gets the conda environments, as a list of (name, path) tuples.
200
+ """
201
+ prefix = conda_info["conda_prefix"]
202
+ ret = []
203
+ for env in conda_info["envs"]:
204
+ if env == prefix:
205
+ ret.append(("base", env))
206
+ else:
207
+ ret.append((os.path.basename(env), env))
208
+ return ret
209
+
210
+
211
+ class ShellCommandException(Exception):
212
+ pass
213
+
214
+
215
+ def exec_cmd(
216
+ cmd: List[str], throw_on_error: bool = True, logger: Optional[logging.Logger] = None
217
+ ) -> Union[int, Tuple[int, str, str]]:
218
+ """
219
+ Runs a command as a child process.
220
+
221
+ A convenience wrapper for running a command from a Python script.
222
+
223
+ Note on the return value: A tuple of the exit code,
224
+ standard output and standard error is returned.
225
+
226
+ Args:
227
+ cmd: the command to run, as a list of strings
228
+ throw_on_error: if true, raises an Exception if the exit code of the
229
+ program is nonzero
230
+ """
231
+ child = subprocess.Popen(
232
+ cmd,
233
+ stdout=subprocess.PIPE,
234
+ stdin=subprocess.PIPE,
235
+ stderr=subprocess.PIPE,
236
+ universal_newlines=True,
237
+ )
238
+ (stdout, stderr) = child.communicate()
239
+ exit_code = child.wait()
240
+ if throw_on_error and exit_code != 0:
241
+ raise ShellCommandException(
242
+ "Non-zero exit code: %s\n\nSTDOUT:\n%s\n\nSTDERR:%s"
243
+ % (exit_code, stdout, stderr)
244
+ )
245
+ return exit_code, stdout, stderr
246
+
247
+
248
+ def exec_cmd_stream_to_logger(
249
+ cmd: List[str], logger: logging.Logger, n_lines: int = 50, **kwargs
250
+ ) -> Tuple[int, str]:
251
+ """Runs a command as a child process, streaming output to the logger.
252
+
253
+ The last n_lines lines of output are also returned (stdout and stderr).
254
+ """
255
+ if "env" in kwargs and _WIN32 and "PATH" not in [x.upper() for x in kwargs.keys]:
256
+ raise ValueError("On windows, Popen requires 'PATH' in 'env'")
257
+ child = subprocess.Popen(
258
+ cmd,
259
+ universal_newlines=True,
260
+ stdout=subprocess.PIPE,
261
+ stderr=subprocess.STDOUT,
262
+ **kwargs,
263
+ )
264
+ last_n_lines = []
265
+ with child.stdout:
266
+ for line in iter(child.stdout.readline, b""):
267
+ exit_code = child.poll()
268
+ if exit_code is not None:
269
+ break
270
+ line = line.strip()
271
+ if not line:
272
+ continue
273
+ last_n_lines.append(line.strip())
274
+ last_n_lines = last_n_lines[-n_lines:]
275
+ logger.info(line.strip())
276
+
277
+ exit_code = child.wait()
278
+ return exit_code, "\n".join(last_n_lines)
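A minimal, hedged usage sketch of the helpers above. It assumes a local conda installation that `get_conda_bin_executable` can locate and that this module is importable as `ray._private.runtime_env.conda_utils`; the command passed to `exec_cmd` is only an illustration.

    # Sketch: list conda environments and run a command with exec_cmd.
    from ray._private.runtime_env.conda_utils import (
        exec_cmd,
        get_conda_envs,
        get_conda_info_json,
    )

    info = get_conda_info_json()  # parsed `conda info --json`
    for name, path in get_conda_envs(info):
        print(f"{name}: {path}")

    # exec_cmd returns (exit_code, stdout, stderr) and raises
    # ShellCommandException on a nonzero exit code when throw_on_error=True.
    code, out, err = exec_cmd(["python", "--version"])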
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/constants.py ADDED
@@ -0,0 +1,28 @@
+# Env var set by job manager to pass runtime env and metadata to subprocess
+RAY_JOB_CONFIG_JSON_ENV_VAR = "RAY_JOB_CONFIG_JSON_ENV_VAR"
+
+# The plugin config which should be loaded when ray cluster starts.
+# It is a json formatted config,
+# e.g. [{"class": "xxx.xxx.xxx_plugin", "priority": 10}].
+RAY_RUNTIME_ENV_PLUGINS_ENV_VAR = "RAY_RUNTIME_ENV_PLUGINS"
+
+# The field name of plugin class in the plugin config.
+RAY_RUNTIME_ENV_CLASS_FIELD_NAME = "class"
+
+# The field name of priority in the plugin config.
+RAY_RUNTIME_ENV_PRIORITY_FIELD_NAME = "priority"
+
+# The default priority of runtime env plugin.
+RAY_RUNTIME_ENV_PLUGIN_DEFAULT_PRIORITY = 10
+
+# The minimum priority of runtime env plugin.
+RAY_RUNTIME_ENV_PLUGIN_MIN_PRIORITY = 0
+
+# The maximum priority of runtime env plugin.
+RAY_RUNTIME_ENV_PLUGIN_MAX_PRIORITY = 100
+
+# The schema files or directories of plugins which should be loaded in workers.
+RAY_RUNTIME_ENV_PLUGIN_SCHEMAS_ENV_VAR = "RAY_RUNTIME_ENV_PLUGIN_SCHEMAS"
+
+# The file suffix of runtime env plugin schemas.
+RAY_RUNTIME_ENV_PLUGIN_SCHEMA_SUFFIX = ".json"
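A brief, hedged illustration of the plugin-config format these constants describe. The class path `my_pkg.MyPlugin` is a made-up placeholder, not a real Ray plugin.

    import json
    import os

    from ray._private.runtime_env.constants import (
        RAY_RUNTIME_ENV_CLASS_FIELD_NAME,
        RAY_RUNTIME_ENV_PLUGIN_DEFAULT_PRIORITY,
        RAY_RUNTIME_ENV_PLUGINS_ENV_VAR,
    )

    # Hypothetical plugin entry in the JSON format described above.
    plugin_config = [
        {
            RAY_RUNTIME_ENV_CLASS_FIELD_NAME: "my_pkg.MyPlugin",
            "priority": RAY_RUNTIME_ENV_PLUGIN_DEFAULT_PRIORITY,
        }
    ]
    os.environ[RAY_RUNTIME_ENV_PLUGINS_ENV_VAR] = json.dumps(plugin_config)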
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/context.py ADDED
@@ -0,0 +1,108 @@
+import json
+import logging
+import os
+import subprocess
+import shlex
+import sys
+from typing import Dict, List, Optional
+
+from ray.util.annotations import DeveloperAPI
+from ray.core.generated.common_pb2 import Language
+from ray._private.services import get_ray_jars_dir
+from ray._private.utils import update_envs
+
+logger = logging.getLogger(__name__)
+
+
+@DeveloperAPI
+class RuntimeEnvContext:
+    """A context used to describe the created runtime env."""
+
+    def __init__(
+        self,
+        command_prefix: List[str] = None,
+        env_vars: Dict[str, str] = None,
+        py_executable: Optional[str] = None,
+        override_worker_entrypoint: Optional[str] = None,
+        java_jars: List[str] = None,
+    ):
+        self.command_prefix = command_prefix or []
+        self.env_vars = env_vars or {}
+        self.py_executable = py_executable or sys.executable
+        self.override_worker_entrypoint: Optional[str] = override_worker_entrypoint
+        self.java_jars = java_jars or []
+
+    def serialize(self) -> str:
+        return json.dumps(self.__dict__)
+
+    @staticmethod
+    def deserialize(json_string):
+        return RuntimeEnvContext(**json.loads(json_string))
+
+    def exec_worker(self, passthrough_args: List[str], language: Language):
+        update_envs(self.env_vars)
+
+        if language == Language.PYTHON and sys.platform == "win32":
+            executable = [self.py_executable]
+        elif language == Language.PYTHON:
+            executable = ["exec", self.py_executable]
+        elif language == Language.JAVA:
+            executable = ["java"]
+            ray_jars = os.path.join(get_ray_jars_dir(), "*")
+
+            local_java_jars = []
+            for java_jar in self.java_jars:
+                local_java_jars.append(f"{java_jar}/*")
+                local_java_jars.append(java_jar)
+
+            class_path_args = ["-cp", ray_jars + ":" + str(":".join(local_java_jars))]
+            passthrough_args = class_path_args + passthrough_args
+        elif sys.platform == "win32":
+            executable = []
+        else:
+            executable = ["exec"]
+
+        # By default, raylet uses the path to default_worker.py on host.
+        # However, the path to default_worker.py inside the container
+        # can be different. We need the user to specify the path to
+        # default_worker.py inside the container.
+        if self.override_worker_entrypoint:
+            logger.debug(
+                f"Changing the worker entrypoint from {passthrough_args[0]} to "
+                f"{self.override_worker_entrypoint}."
+            )
+            passthrough_args[0] = self.override_worker_entrypoint
+
+        if sys.platform == "win32":
+
+            def quote(s):
+                s = s.replace("&", "%26")
+                return s
+
+            passthrough_args = [quote(s) for s in passthrough_args]
+
+            cmd = [*self.command_prefix, *executable, *passthrough_args]
+            logger.debug(f"Exec'ing worker with command: {cmd}")
+            subprocess.Popen(cmd, shell=True).wait()
+        else:
+            # We use shlex to do the necessary shell escape
+            # of special characters in passthrough_args.
+            passthrough_args = [shlex.quote(s) for s in passthrough_args]
+            cmd = [*self.command_prefix, *executable, *passthrough_args]
+            # TODO(SongGuyang): We add this env to command for macOS because it doesn't
+            # work for the C++ process of `os.execvp`. We should find a better way to
+            # fix it.
+            MACOS_LIBRARY_PATH_ENV_NAME = "DYLD_LIBRARY_PATH"
+            if MACOS_LIBRARY_PATH_ENV_NAME in os.environ:
+                cmd.insert(
+                    0,
+                    f"{MACOS_LIBRARY_PATH_ENV_NAME}="
+                    f"{os.environ[MACOS_LIBRARY_PATH_ENV_NAME]}",
+                )
+            logger.debug(f"Exec'ing worker with command: {cmd}")
+            # PyCharm will monkey patch the os.execvp at
+            # .pycharm_helpers/pydev/_pydev_bundle/pydev_monkey.py
+            # The monkey patched os.execvp function has a different
+            # signature. So, we use os.execvp("executable", args=[])
+            # instead of os.execvp(file="executable", args=[])
+            os.execvp("bash", args=["bash", "-c", " ".join(cmd)])
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/default_impl.py ADDED
@@ -0,0 +1,11 @@
+from ray._private.runtime_env.image_uri import ImageURIPlugin
+
+
+def get_image_uri_plugin_cls():
+    return ImageURIPlugin
+
+
+def get_protocols_provider():
+    from ray._private.runtime_env.protocol import ProtocolsProvider
+
+    return ProtocolsProvider
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/dependency_utils.py ADDED
@@ -0,0 +1,113 @@
+"""Util functions to manage dependency requirements."""
+
+from typing import List, Tuple, Optional
+import os
+import tempfile
+import logging
+from contextlib import asynccontextmanager
+from ray._private.runtime_env import virtualenv_utils
+from ray._private.runtime_env.utils import check_output_cmd
+
+INTERNAL_PIP_FILENAME = "ray_runtime_env_internal_pip_requirements.txt"
+MAX_INTERNAL_PIP_FILENAME_TRIES = 100
+
+
+def gen_requirements_txt(requirements_file: str, pip_packages: List[str]):
+    """Dump [pip_packages] to the given [requirements_file] for later env setup."""
+    with open(requirements_file, "w") as file:
+        for line in pip_packages:
+            file.write(line + "\n")
+
+
+@asynccontextmanager
+async def check_ray(python: str, cwd: str, logger: logging.Logger):
+    """A context manager to check ray is not overwritten.
+
+    Currently, we only check ray version and path. It works for virtualenv,
+    - ray is in Python's site-packages.
+    - ray is overwritten during yield.
+    - ray is in virtualenv's site-packages.
+    """
+
+    async def _get_ray_version_and_path() -> Tuple[str, str]:
+        with tempfile.TemporaryDirectory(
+            prefix="check_ray_version_tempfile"
+        ) as tmp_dir:
+            ray_version_path = os.path.join(tmp_dir, "ray_version.txt")
+            check_ray_cmd = [
+                python,
+                "-c",
+                """
+import ray
+with open(r"{ray_version_path}", "wt") as f:
+    f.write(ray.__version__)
+    f.write(" ")
+    f.write(ray.__path__[0])
+            """.format(
+                ray_version_path=ray_version_path
+            ),
+            ]
+            if virtualenv_utils._WIN32:
+                env = os.environ.copy()
+            else:
+                env = {}
+            output = await check_output_cmd(
+                check_ray_cmd, logger=logger, cwd=cwd, env=env
+            )
+            logger.info(f"try to write ray version information in: {ray_version_path}")
+            with open(ray_version_path, "rt") as f:
+                output = f.read()
+        # print after import ray may have \r\n endings, so we strip them by *_
+        ray_version, ray_path, *_ = [s.strip() for s in output.split()]
+        return ray_version, ray_path
+
+    version, path = await _get_ray_version_and_path()
+    yield
+    actual_version, actual_path = await _get_ray_version_and_path()
+    if actual_version != version or actual_path != path:
+        raise RuntimeError(
+            "Changing the ray version is not allowed: \n"
+            f"  current version: {actual_version}, "
+            f"current path: {actual_path}\n"
+            f"  expect version: {version}, "
+            f"expect path: {path}\n"
+            "Please ensure the dependencies in the runtime_env pip field "
+            "do not install a different version of Ray."
+        )
+
+
+def get_requirements_file(target_dir: str, pip_list: Optional[List[str]]) -> str:
+    """Returns the path to the requirements file to use for this runtime env.
+
+    If pip_list is not None, we will check if the internal pip filename is in any of
+    the entries of pip_list. If so, we will append numbers to the end of the
+    filename until we find one that doesn't conflict. This prevents infinite
+    recursion if the user specifies the internal pip filename in their pip list.
+
+    Args:
+        target_dir: The directory to store the requirements file in.
+        pip_list: A list of pip requirements specified by the user.
+
+    Returns:
+        The path to the requirements file to use for this runtime env.
+    """
+
+    def filename_in_pip_list(filename: str) -> bool:
+        for pip_entry in pip_list:
+            if filename in pip_entry:
+                return True
+        return False
+
+    filename = INTERNAL_PIP_FILENAME
+    if pip_list is not None:
+        i = 1
+        while filename_in_pip_list(filename) and i < MAX_INTERNAL_PIP_FILENAME_TRIES:
+            filename = f"{INTERNAL_PIP_FILENAME}.{i}"
+            i += 1
+        if i == MAX_INTERNAL_PIP_FILENAME_TRIES:
+            raise RuntimeError(
+                "Could not find a valid filename for the internal "
+                "pip requirements file. Please specify a different "
+                "pip list in your runtime env."
+            )
+    return os.path.join(target_dir, filename)
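A hedged usage sketch for the two pip-requirements helpers above; the package pins are arbitrary examples, not requirements of any particular runtime env.

    import tempfile

    from ray._private.runtime_env.dependency_utils import (
        gen_requirements_txt,
        get_requirements_file,
    )

    pip_list = ["requests==2.31.0", "numpy"]  # arbitrary example packages
    with tempfile.TemporaryDirectory() as target_dir:
        req_file = get_requirements_file(target_dir, pip_list)
        gen_requirements_txt(req_file, pip_list)
        with open(req_file) as f:
            print(f.read())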
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/image_uri.py ADDED
@@ -0,0 +1,195 @@
+import logging
+import os
+from typing import List, Optional
+
+from ray._private.runtime_env.context import RuntimeEnvContext
+from ray._private.runtime_env.plugin import RuntimeEnvPlugin
+from ray._private.runtime_env.utils import check_output_cmd
+
+default_logger = logging.getLogger(__name__)
+
+
+async def _create_impl(image_uri: str, logger: logging.Logger):
+    # Pull image if it doesn't exist
+    # Also get path to `default_worker.py` inside the image.
+    pull_image_cmd = [
+        "podman",
+        "run",
+        "--rm",
+        image_uri,
+        "python",
+        "-c",
+        (
+            "import ray._private.workers.default_worker as default_worker; "
+            "print(default_worker.__file__)"
+        ),
+    ]
+    logger.info("Pulling image %s", image_uri)
+    worker_path = await check_output_cmd(pull_image_cmd, logger=logger)
+    return worker_path.strip()
+
+
+def _modify_context_impl(
+    image_uri: str,
+    worker_path: str,
+    run_options: Optional[List[str]],
+    context: RuntimeEnvContext,
+    logger: logging.Logger,
+    ray_tmp_dir: str,
+):
+    context.override_worker_entrypoint = worker_path
+
+    container_driver = "podman"
+    container_command = [
+        container_driver,
+        "run",
+        "-v",
+        ray_tmp_dir + ":" + ray_tmp_dir,
+        "--cgroup-manager=cgroupfs",
+        "--network=host",
+        "--pid=host",
+        "--ipc=host",
+        # NOTE(zcin): Mounted volumes in rootless containers are
+        # owned by the user `root`. The user on host (which will
+        # usually be `ray` if this is being run in a ray docker
+        # image) who started the container is mapped using user
+        # namespaces to the user `root` in a rootless container. In
+        # order for the Ray Python worker to access the mounted ray
+        # tmp dir, we need to use keep-id mode which maps the user
+        # as itself (instead of as `root`) into the container.
+        # https://www.redhat.com/sysadmin/rootless-podman-user-namespace-modes
+        "--userns=keep-id",
+    ]
+
+    # Environment variables to set in container
+    env_vars = dict()
+
+    # Propagate all host environment variables that have the prefix "RAY_"
+    # This should include RAY_RAYLET_PID
+    for env_var_name, env_var_value in os.environ.items():
+        if env_var_name.startswith("RAY_"):
+            env_vars[env_var_name] = env_var_value
+
+    # Support for runtime_env['env_vars']
+    env_vars.update(context.env_vars)
+
+    # Set environment variables
+    for env_var_name, env_var_value in env_vars.items():
+        container_command.append("--env")
+        container_command.append(f"{env_var_name}='{env_var_value}'")
+
+    # The RAY_JOB_ID environment variable is needed for the default worker.
+    # It won't be set at the time setup() is called, but it will be set
+    # when worker command is executed, so we use RAY_JOB_ID=$RAY_JOB_ID
+    # for the container start command
+    container_command.append("--env")
+    container_command.append("RAY_JOB_ID=$RAY_JOB_ID")
+
+    if run_options:
+        container_command.extend(run_options)
+    # TODO(chenk008): add resource limit
+    container_command.append("--entrypoint")
+    container_command.append("python")
+    container_command.append(image_uri)
+
+    # Example:
+    # podman run -v /tmp/ray:/tmp/ray
+    # --cgroup-manager=cgroupfs --network=host --pid=host --ipc=host
+    # --userns=keep-id --env RAY_RAYLET_PID=23478 --env RAY_JOB_ID=$RAY_JOB_ID
+    # --entrypoint python rayproject/ray:nightly-py39
+    container_command_str = " ".join(container_command)
+    logger.info(f"Starting worker in container with prefix {container_command_str}")
+
+    context.py_executable = container_command_str
+
+
+class ImageURIPlugin(RuntimeEnvPlugin):
+    """Starts worker in a container of a custom image."""
+
+    name = "image_uri"
+
+    @staticmethod
+    def get_compatible_keys():
+        return {"image_uri", "config", "env_vars"}
+
+    def __init__(self, ray_tmp_dir: str):
+        self._ray_tmp_dir = ray_tmp_dir
+
+    async def create(
+        self,
+        uri: Optional[str],
+        runtime_env: "RuntimeEnv",  # noqa: F821
+        context: RuntimeEnvContext,
+        logger: logging.Logger,
+    ) -> float:
+        if not runtime_env.image_uri():
+            return
+
+        self.worker_path = await _create_impl(runtime_env.image_uri(), logger)
+
+    def modify_context(
+        self,
+        uris: List[str],
+        runtime_env: "RuntimeEnv",  # noqa: F821
+        context: RuntimeEnvContext,
+        logger: Optional[logging.Logger] = default_logger,
+    ):
+        if not runtime_env.image_uri():
+            return
+
+        _modify_context_impl(
+            runtime_env.image_uri(),
+            self.worker_path,
+            [],
+            context,
+            logger,
+            self._ray_tmp_dir,
+        )
+
+
+class ContainerPlugin(RuntimeEnvPlugin):
+    """Starts worker in container."""
+
+    name = "container"
+
+    def __init__(self, ray_tmp_dir: str):
+        self._ray_tmp_dir = ray_tmp_dir
+
+    async def create(
+        self,
+        uri: Optional[str],
+        runtime_env: "RuntimeEnv",  # noqa: F821
+        context: RuntimeEnvContext,
+        logger: logging.Logger,
+    ) -> float:
+        if not runtime_env.has_py_container() or not runtime_env.py_container_image():
+            return
+
+        self.worker_path = await _create_impl(runtime_env.py_container_image(), logger)
+
+    def modify_context(
+        self,
+        uris: List[str],
+        runtime_env: "RuntimeEnv",  # noqa: F821
+        context: RuntimeEnvContext,
+        logger: Optional[logging.Logger] = default_logger,
+    ):
+        if not runtime_env.has_py_container() or not runtime_env.py_container_image():
+            return
+
+        if runtime_env.py_container_worker_path():
+            logger.warning(
+                "You are using `container.worker_path`, but the path to "
+                "`default_worker.py` is now automatically detected from the image. "
+                "`container.worker_path` is deprecated and will be removed in future "
+                "versions."
+            )
+
+        _modify_context_impl(
+            runtime_env.py_container_image(),
+            runtime_env.py_container_worker_path() or self.worker_path,
+            runtime_env.py_container_run_options(),
+            context,
+            logger,
+            self._ray_tmp_dir,
+        )
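A hedged sketch of how the private helper above rewrites the context into a podman command prefix. The image name, worker path, and tmp dir are placeholders; this only builds the command string and does not start a container.

    import logging

    from ray._private.runtime_env.context import RuntimeEnvContext
    from ray._private.runtime_env.image_uri import _modify_context_impl

    ctx = RuntimeEnvContext(env_vars={"MY_FLAG": "1"})  # illustrative env var
    _modify_context_impl(
        image_uri="rayproject/ray:nightly-py39",                # placeholder image
        worker_path="/some/path/in/image/default_worker.py",    # placeholder path
        run_options=None,
        context=ctx,
        logger=logging.getLogger(__name__),
        ray_tmp_dir="/tmp/ray",
    )
    # ctx.py_executable now holds the "podman run ..." prefix used to start the worker.
    print(ctx.py_executable)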
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/java_jars.py ADDED
@@ -0,0 +1,103 @@
+import logging
+import os
+from typing import Dict, List, Optional
+
+from ray._private.gcs_utils import GcsAioClient
+from ray._private.runtime_env.context import RuntimeEnvContext
+from ray._private.runtime_env.packaging import (
+    delete_package,
+    download_and_unpack_package,
+    get_local_dir_from_uri,
+    is_jar_uri,
+)
+from ray._private.runtime_env.plugin import RuntimeEnvPlugin
+from ray._private.utils import get_directory_size_bytes, try_to_create_directory
+from ray.exceptions import RuntimeEnvSetupError
+
+default_logger = logging.getLogger(__name__)
+
+
+class JavaJarsPlugin(RuntimeEnvPlugin):
+
+    name = "java_jars"
+
+    def __init__(self, resources_dir: str, gcs_aio_client: GcsAioClient):
+        self._resources_dir = os.path.join(resources_dir, "java_jars_files")
+        self._gcs_aio_client = gcs_aio_client
+        try_to_create_directory(self._resources_dir)
+
+    def _get_local_dir_from_uri(self, uri: str):
+        return get_local_dir_from_uri(uri, self._resources_dir)
+
+    def delete_uri(
+        self, uri: str, logger: Optional[logging.Logger] = default_logger
+    ) -> int:
+        """Delete URI and return the number of bytes deleted."""
+        local_dir = get_local_dir_from_uri(uri, self._resources_dir)
+        local_dir_size = get_directory_size_bytes(local_dir)
+
+        deleted = delete_package(uri, self._resources_dir)
+        if not deleted:
+            logger.warning(f"Tried to delete nonexistent URI: {uri}.")
+            return 0
+
+        return local_dir_size
+
+    def get_uris(self, runtime_env: dict) -> List[str]:
+        return runtime_env.java_jars()
+
+    async def _download_jars(
+        self, uri: str, logger: Optional[logging.Logger] = default_logger
+    ):
+        """Download a jar URI."""
+        try:
+            jar_file = await download_and_unpack_package(
+                uri, self._resources_dir, self._gcs_aio_client, logger=logger
+            )
+        except Exception as e:
+            raise RuntimeEnvSetupError(
+                "Failed to download jar file: {}".format(e)
+            ) from e
+        module_dir = self._get_local_dir_from_uri(uri)
+        logger.debug(f"Successfully downloaded jar file {jar_file}.")
+        return module_dir
+
+    async def create(
+        self,
+        uri: str,
+        runtime_env: "RuntimeEnv",  # noqa: F821
+        context: RuntimeEnvContext,
+        logger: Optional[logging.Logger] = default_logger,
+    ) -> int:
+        if not uri:
+            return 0
+        if is_jar_uri(uri):
+            module_dir = await self._download_jars(uri=uri, logger=logger)
+        else:
+            try:
+                module_dir = await download_and_unpack_package(
+                    uri, self._resources_dir, self._gcs_aio_client, logger=logger
+                )
+            except Exception as e:
+                raise RuntimeEnvSetupError(
+                    "Failed to download jar file: {}".format(e)
+                ) from e
+
+        return get_directory_size_bytes(module_dir)
+
+    def modify_context(
+        self,
+        uris: List[str],
+        runtime_env_dict: Dict,
+        context: RuntimeEnvContext,
+        logger: Optional[logging.Logger] = default_logger,
+    ):
+        for uri in uris:
+            module_dir = self._get_local_dir_from_uri(uri)
+            if not module_dir.exists():
+                raise ValueError(
+                    f"Local directory {module_dir} for URI {uri} does "
+                    "not exist on the cluster. Something may have gone wrong while "
+                    "downloading, unpacking or installing the java jar files."
+                )
+            context.java_jars.append(str(module_dir))
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/mpi.py ADDED
@@ -0,0 +1,114 @@
+import logging
+import os
+from typing import List, Optional
+from ray._private.runtime_env.context import RuntimeEnvContext
+from ray._private.runtime_env.plugin import RuntimeEnvPlugin
+import subprocess
+
+default_logger = logging.getLogger(__name__)
+
+
+def mpi_init():
+    """Initialize the MPI cluster. When using an MPI cluster, this must be called first."""
+
+    if hasattr(mpi_init, "inited"):
+        assert mpi_init.inited is True
+        return
+
+    from mpi4py import MPI
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    if rank == 0:
+        from ray._private.accelerators import get_all_accelerator_managers
+
+        device_vars = [
+            m.get_visible_accelerator_ids_env_var()
+            for m in get_all_accelerator_managers()
+        ]
+        visible_devices = {
+            n: os.environ.get(n) for n in device_vars if os.environ.get(n)
+        }
+        comm.bcast(visible_devices)
+        with open(f"/tmp/{os.getpid()}.{rank}", "w") as f:
+            f.write(str(visible_devices))
+    else:
+        visible_devices = comm.bcast(None)
+        os.environ.update(visible_devices)
+    mpi_init.inited = True
+
+
+class MPIPlugin(RuntimeEnvPlugin):
+    """This plugin enables an MPI cluster to run on top of Ray.
+
+    To use it, add "mpi" to the runtime env as follows:
+
+    @ray.remote(
+        runtime_env={
+            "mpi": {
+                "args": ["-n", "4"],
+                "worker_entry": worker_entry,
+            }
+        }
+    )
+    def calc_pi():
+        ...
+
+    Here worker_entry should be the function for the MPI workers to run.
+    For example, it should be `'py_module.worker_func'`. The module should be able to
+    be imported in the runtime.
+
+    In the MPI worker with rank == 0, it'll be the normal ray function or actor.
+    For workers with rank > 0, it'll just run `worker_func`.
+
+    ray.runtime_env.mpi_init must be called in the ray actors/tasks before any MPI
+    communication.
+    """
+
+    priority = 90
+    name = "mpi"
+
+    def modify_context(
+        self,
+        uris: List[str],  # noqa: ARG002
+        runtime_env: "RuntimeEnv",  # noqa: F821 ARG002
+        context: RuntimeEnvContext,
+        logger: Optional[logging.Logger] = default_logger,  # noqa: ARG002
+    ) -> None:
+        mpi_config = runtime_env.mpi()
+        if mpi_config is None:
+            return
+        try:
+            proc = subprocess.run(
+                ["mpirun", "--version"], capture_output=True, check=True
+            )
+        except subprocess.CalledProcessError:
+            logger.exception(
+                "Failed to run mpirun. Please make sure MPI has been installed."
+            )
+            # The worker will fail to run and exception will be thrown in runtime
+            # env agent.
+            raise
+
+        logger.info(f"Running MPI plugin\n {proc.stdout.decode()}")
+
+        # worker_entry should be a file either in the working dir
+        # or visible inside the cluster.
+        worker_entry = mpi_config.get("worker_entry")
+
+        assert (
+            worker_entry is not None
+        ), "`worker_entry` must be set up in the runtime env."
+
+        cmds = (
+            ["mpirun"]
+            + mpi_config.get("args", [])
+            + [
+                context.py_executable,
+                "-m",
+                "ray._private.runtime_env.mpi_runner",
+                worker_entry,
+            ]
+        )
+        # Construct the start cmd
+        context.py_executable = " ".join(cmds)
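To complement the docstring above, a hedged sketch of what a `worker_entry` target might look like. The module name `my_mpi_module` and the function are hypothetical; the module must be importable on every node and mpi4py must be installed.

    # my_mpi_module.py (hypothetical module referenced as "my_mpi_module.worker_func")
    from mpi4py import MPI

    from ray.runtime_env import mpi_init


    def worker_func():
        # Must be called before any MPI communication, per the plugin docstring.
        mpi_init()
        comm = MPI.COMM_WORLD
        print(f"MPI worker rank {comm.Get_rank()} of {comm.Get_size()} started")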
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/mpi_runner.py ADDED
@@ -0,0 +1,32 @@
+import sys
+import argparse
+import importlib.util
+from mpi4py import MPI
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Setup MPI worker")
+    parser.add_argument("worker_entry")
+    parser.add_argument("main_entry")
+
+    args, remaining_args = parser.parse_known_args()
+
+    comm = MPI.COMM_WORLD
+
+    rank = comm.Get_rank()
+
+    if rank == 0:
+        entry_file = args.main_entry
+
+        sys.argv[1:] = remaining_args
+        spec = importlib.util.spec_from_file_location("__main__", entry_file)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+    else:
+        from ray.runtime_env import mpi_init
+
+        mpi_init()
+        module, func = args.worker_entry.rsplit(".", 1)
+        m = importlib.import_module(module)
+        f = getattr(m, func)
+        f()