Spaces:

littlebird13
/

qwen1.5_1.8B

Runtime error

App Files Files Community

ljy266987 committed on Apr 18, 2024

Commit

679081c

1 Parent(s): a826f18

add modified zero

Browse files

Files changed (16) hide show

.idea/.gitignore +8 -0
spaces/__init__.py +32 -0
spaces/config.py +29 -0
spaces/gradio.py +55 -0
spaces/utils.py +73 -0
spaces/zero/__init__.py +12 -0
spaces/zero/api.py +163 -0
spaces/zero/bitsandbytes.py +135 -0
spaces/zero/client.py +176 -0
spaces/zero/decorator.py +117 -0
spaces/zero/gradio.py +108 -0
spaces/zero/torch.py +279 -0
spaces/zero/tqdm.py +14 -0
spaces/zero/types.py +44 -0
spaces/zero/utils.py +44 -0
spaces/zero/wrappers.py +347 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

spaces/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+"""
+import sys
+if sys.version_info.minor < 8: # pragma: no cover
+    raise RuntimeError("Importing PySpaces requires Python 3.8+")
+from .zero.decorator import GPU
+from .zero.torch import disable_cuda_intercept
+from .gradio import gradio_auto_wrap
+from .gradio import disable_gradio_auto_wrap
+from .gradio import enable_gradio_auto_wrap
+import os
+# Fetch all environment variables
+env_vars = os.environ
+# Iterate over the environment variables and print each one
+# NOTE(review): this runs at import time and writes every variable's value to
+# stdout, which would include any secret stored in the environment - confirm
+# this debug dump is intended before shipping.
+for key, value in env_vars.items():
+    print(f"{key}: {value}")
+__all__ = [
+    'GPU',
+    'disable_cuda_intercept',
+    'gradio_auto_wrap',
+    'disable_gradio_auto_wrap',
+    'enable_gradio_auto_wrap',
+]

spaces/config.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+"""
+from __future__ import annotations
+import os
+from .utils import boolean
+class Settings:
+    """Settings read from environment variables when the instance is created."""
+    def __init__(self):
+        # Flags go through `boolean()` from `.utils`; the API URL is kept as
+        # the raw `os.getenv` result (a string, or None when unset).
+        self.zero_gpu = boolean(
+            os.getenv('SPACES_ZERO_GPU'))
+        self.zero_device_api_url = (
+            os.getenv('SPACES_ZERO_DEVICE_API_URL'))
+        self.gradio_auto_wrap = boolean(
+            os.getenv('SPACES_GRADIO_AUTO_WRAP'))
+        self.zero_patch_torch_device = boolean(
+            os.getenv('ZERO_GPU_PATCH_TORCH_DEVICE'))
+# Module-level settings instance, built once at import time.
+Config = Settings()
+# When the ZeroGPU flag is on, the device API URL must also be set.
+# NOTE(review): `assert` is skipped under `python -O`, so this check would not run there.
+if Config.zero_gpu:
+    assert Config.zero_device_api_url is not None, (
+        'SPACES_ZERO_DEVICE_API_URL env must be set '
+        'on ZeroGPU Spaces (identified by SPACES_ZERO_GPU=true)'
+    )

spaces/gradio.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""
+"""
+from __future__ import annotations
+from typing import Callable
+from typing import Generator
+from typing import TypeVar
+from typing import overload
+from typing_extensions import ParamSpec
+from .config import Config
+from .zero.decorator import GPU
+Param = ParamSpec('Param')
+Res = TypeVar('Res')
+gradio_auto_wrap_enabled = Config.gradio_auto_wrap
+def disable_gradio_auto_wrap():
+    """Turn off auto-wrapping: `gradio_auto_wrap` then returns tasks unchanged."""
+    global gradio_auto_wrap_enabled
+    gradio_auto_wrap_enabled = False
+def enable_gradio_auto_wrap():
+    """Turn on auto-wrapping: `gradio_auto_wrap` then wraps callables with `GPU`."""
+    global gradio_auto_wrap_enabled
+    gradio_auto_wrap_enabled = True
+@overload
+def gradio_auto_wrap(
+    task:
+     Callable[Param, Res],
+) -> Callable[Param, Res]:
+    ...
+@overload
+def gradio_auto_wrap(
+    task:
+     None,
+) -> None:
+    ...
+def gradio_auto_wrap(
+    task:
+      Callable[Param, Res]
+    | None,
+) -> (Callable[Param, Res]
+    | None):
+    """
+    Wrap `task` with `GPU` when Gradio auto-wrapping is enabled.
+    Args:
+        task (`Callable | None`): candidate function, or `None`
+    Returns:
+        `task` unchanged when auto-wrapping is disabled or `task` is not
+        callable (e.g. `None`); otherwise `GPU(task)`
+    """
+    # The module-level flag is read at call time, so enable/disable calls
+    # made after import are taken into account.
+    if not gradio_auto_wrap_enabled:
+        return task
+    if not callable(task):
+        return task
+    return GPU(task) # type: ignore

spaces/utils.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+"""
+from __future__ import annotations
+import sys
+from functools import lru_cache as cache
+from functools import partial
+import multiprocessing
+from multiprocessing.queues import SimpleQueue as _SimpleQueue
+from pathlib import Path
+from pickle import PicklingError
+from typing import Callable
+from typing import TypeVar
+GRADIO_VERSION_ERROR_MESSAGE = "Make sure Gradio version is at least 3.46"
+T = TypeVar('T')
+@cache
+def self_cgroup_device_path() -> str:
+    cgroup_content = Path('/proc/self/cgroup').read_text()
+    for line in cgroup_content.strip().split('\n'):
+        contents = line.split(':devices:')
+        if len(contents) != 2:
+            continue # pragma: no cover
+        return contents[1]
+    raise Exception # pragma: no cover
+if sys.version_info.minor < 9: # pragma: no cover
+    _SimpleQueue.__class_getitem__ = classmethod(lambda cls, _: cls) # type: ignore
+class SimpleQueue(_SimpleQueue[T]):
+    def __init__(self, *args):
+        super().__init__(*args, ctx=multiprocessing.get_context('fork'))
+    def put(self, obj: T):
+        try:
+            super().put(obj)
+        except PicklingError:
+            raise # pragma: no cover
+        # https://bugs.python.org/issue29187
+        except Exception as e:
+            message = str(e)
+            if not "pickle" in message:
+                raise # pragma: no cover
+            raise PicklingError(message)
+    def close(self): # Python 3.8 static typing trick
+        super().close() # type: ignore
+def drop_params(fn: Callable[[], T]) -> Callable[..., T]:
+    """Return a wrapper that accepts any positional arguments, ignores them, and calls `fn()`."""
+    def drop(*args):
+        return fn()
+    return drop
+def boolean(value: str | None) -> bool:
+    """Return True only when `value` is "1", "t" or "true" (case-insensitive); None yields False."""
+    return value is not None and value.lower() in ("1", "t", "true")
+def gradio_request_var():
+    """
+    Return `gradio.context.LocalContext.request`.
+    Raises:
+        RuntimeError: if `LocalContext` cannot be imported from `gradio.context`
+    """
+    # Imported lazily so the failure is reported with the version hint message
+    try:
+        from gradio.context import LocalContext
+    except ImportError: # pragma: no cover
+        raise RuntimeError(GRADIO_VERSION_ERROR_MESSAGE)
+    return LocalContext.request
+debug = partial(print, 'SPACES_ZERO_GPU_DEBUG')

spaces/zero/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""
+"""
+from ..config import Config
+from . import torch
+# On ZeroGPU (Config.zero_gpu), apply the torch patch at import time, but
+# refuse to continue when `torch.is_in_bad_fork()` reports a bad state.
+if Config.zero_gpu:
+    if torch.is_in_bad_fork():
+        raise RuntimeError(
+            "CUDA has been initialized before importing the `spaces` package"
+        )
+    torch.patch() # pragma: no cover

spaces/zero/api.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""
+Synced with huggingface/pyspaces:spaces/zero/api.py
+"""
+from __future__ import annotations
+from datetime import timedelta
+from typing import Any
+from typing import Generator
+from typing import Literal
+from typing import NamedTuple
+from typing import Optional
+from typing import overload
+import httpx
+from pydantic import BaseModel
+from typing_extensions import assert_never
+AllowToken = str
+NvidiaIndex = int # TODO: Migrate to GpuIndex (less confusing for MIG)
+NvidiaUUID = str
+CGroupPath = str
+VisitorId = str
+Score = float
+class ScheduleResponse(BaseModel):
+    """Payload parsed from a successful `/schedule` response or a `succeeded` queue event."""
+    idle: bool
+    nvidiaIndex: int
+    nvidiaUUID: str
+    allowToken: str | None
+class QuotaInfos(BaseModel):
+    """Payload parsed from a `/schedule` response with status 429 (Too Many Requests)."""
+    left: int
+    wait: timedelta
+class ReportUsageMonitoringParams(NamedTuple):
+    """Tuple of (nvidia_index, visitor_id, duration). NOTE(review): no use of it is visible in this file."""
+    nvidia_index: int
+    visitor_id: str
+    duration: timedelta
+class QueueEvent(BaseModel):
+    """One parsed queue event; `data` is only filled by `sse_parse` for `succeeded` events."""
+    event: Literal['ping', 'failed', 'succeeded']
+    data: Optional[ScheduleResponse] = None
+def sse_parse(text: str):
+    """
+    Parse one server-sent-event text block into a `QueueEvent`.
+    The first line must start with `event:`. `ping` and `failed` events are
+    returned without payload; a `succeeded` event must be followed by exactly
+    one `data:` line, whose content is parsed as a `ScheduleResponse`.
+    NOTE(review): malformed input surfaces as AssertionError or ValueError from
+    the asserts and tuple unpacking below.
+    """
+    event, *data = text.strip().splitlines()
+    assert event.startswith('event:')
+    # Strip the 6-character 'event:' prefix
+    event = event[6:].strip()
+    if event in ('ping', 'failed'):
+        return QueueEvent(event=event)
+    assert event == 'succeeded'
+    # Exactly one remaining line is expected
+    (data,) = data
+    assert data.startswith('data:')
+    # Strip the 5-character 'data:' prefix
+    data = data[5:].strip()
+    return QueueEvent(event=event, data=ScheduleResponse.parse_raw(data))
+def sse_stream(res: httpx.Response) -> Generator[QueueEvent, Any, None]:
+    for text in res.iter_text():
+        if len(text) == 0:
+            break # pragma: no cover
+        try:
+            print(f"sse_stream: {text}")
+            yield sse_parse(text)
+        except GeneratorExit:
+            res.close()
+            break
+class APIClient:
+    """Thin wrapper issuing the device API HTTP calls through an `httpx.Client`."""
+    def __init__(self, client: httpx.Client):
+        self.client = client
+    def startup_report(self) -> httpx.codes:
+        """POST `/startup-report` and return the response status code."""
+        res = self.client.post('/startup-report')
+        print(f"/startup-report: {res}")
+        return httpx.codes(res.status_code)
+    def schedule(
+        self,
+        cgroup_path: str,
+        task_id: int = 0,
+        token: str | None = None,
+        duration_seconds: int | None = None,
+        enable_queue: bool = True,
+    ):
+        """
+        POST `/schedule` as a streamed request.
+        Returns:
+            - the status code, for any status other than 200 and 429
+            - an `sse_stream` generator, when the content type is an event stream
+            - `QuotaInfos` for a 429 JSON response
+            - `ScheduleResponse` for a 200 JSON response
+        """
+        params: dict[str, str | int | bool] = {
+            'cgroupPath': cgroup_path,
+            'taskId': task_id,
+            'enableQueue': enable_queue,
+        }
+        # Optional parameters are only sent when provided
+        if duration_seconds is not None:
+            params['durationSeconds'] = duration_seconds
+        if token is not None:
+            params['token'] = token
+        # NOTE(review): this prints `params`, including `token` when set
+        print(f"POST /schedule: {params}")
+        res = self.client.send(
+            request=self.client.build_request(
+                method='POST',
+                url='/schedule',
+                params=params,
+            ),
+            stream=True,
+        )
+        status = httpx.codes(res.status_code)
+        if (status is not httpx.codes.OK and
+            status is not httpx.codes.TOO_MANY_REQUESTS
+        ):
+            res.close()
+            return status
+        if "text/event-stream" in res.headers['content-type']:
+            return sse_stream(res)
+        # Non-streaming answer: load the body before decoding it as JSON
+        res.read()
+        print(f"POST /schedule res: {res.json()}")
+        if status is httpx.codes.TOO_MANY_REQUESTS:
+            return QuotaInfos(**res.json()) # pragma: no cover
+        if status is httpx.codes.OK:
+            return ScheduleResponse(**res.json())
+        assert_never(status)
+    def allow(
+        self,
+        allow_token: str,
+        pid: int,
+    ):
+        """POST `/allow` with the token and pid; return the response status code."""
+        params = {
+            'allowToken': allow_token,
+            'pid': pid,
+        }
+        res = self.client.post('/allow', params=params)
+        print(f"POST /allow param: {params} res: {res}")
+        return httpx.codes(res.status_code)
+    def release(
+        self,
+        nvidia_index: int,
+        cgroup_path: str,
+        task_id: int = 0,
+        fail: bool = False,
+    ) -> httpx.codes:
+        """POST `/release` with the given parameters; return the response status code."""
+        params = {
+            'nvidiaIndex': nvidia_index,
+            'cgroupPath': cgroup_path,
+            'taskId': task_id,
+            'fail': fail,
+        }
+        res = self.client.post('/release', params=params)
+        print(f"POST /release param: {params} res: {res}")
+        return httpx.codes(res.status_code)
+    def get_queue_size(self) -> int:
+        """GET `/queue-size` and return the JSON body, asserted to be an int."""
+        res = self.client.get('/queue-size')
+        assert res.status_code == 200, res.status_code
+        size = res.json()
+        assert isinstance(size, int)
+        return size

spaces/zero/bitsandbytes.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+"""
+# pyright: reportPrivateImportUsage=false
+from __future__ import annotations
+import importlib
+from typing import TYPE_CHECKING
+from typing import Tuple
+from .utils import cuda_unavailable
+from .utils import maybe_import_torch
+from .utils import maybe_import_bitsandbytes
+if TYPE_CHECKING:
+    import torch as Torch
+if (torch := maybe_import_torch()) and (bnb := maybe_import_bitsandbytes()):
+    from torch.utils.weak import WeakTensorKeyDictionary
+    with cuda_unavailable(torch):
+        from bitsandbytes import cextension
+        from bitsandbytes import functional
+        try: # bitsandbytes < 0.44
+            from bitsandbytes.cuda_setup.main import CUDASetup
+        except ModuleNotFoundError: # pragma: no cover
+            CUDASetup = None
+        from bitsandbytes.nn import Int8Params
+        from bitsandbytes.nn import Params4bit
+    _param_to_8bit   = Int8Params.to     # type: ignore
+    _param_cuda_8bit = Int8Params.cuda
+    _param_to_4bit   = Params4bit.to     # type: ignore
+    _param_cuda_4bit = Params4bit.cuda
+    TensorToArgs = Tuple[torch.device, torch.dtype, bool, torch.memory_format]
+    to_ops_8bit: dict[Int8Params, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
+    to_ops_4bit: dict[Params4bit, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
+    def _to_op_register_8bit(self: Int8Params, *args, **kwargs):
+        """
+        Replacement for `Int8Params.to`: record a CUDA move instead of doing it.
+        Non-CUDA targets are forwarded to the original `to`. For a CUDA target the
+        parsed arguments are stored in `to_ops_8bit` and `self` is returned as is.
+        """
+        parsed = torch._C._nn._parse_to(*args, **kwargs)
+        device, *_ = parsed
+        if not isinstance(device, torch.device): # pragma: no cover
+            return _param_to_8bit(self, *args, **kwargs)
+        if device.type != 'cuda':
+            return _param_to_8bit(self, *args, **kwargs)
+        to_ops_8bit[self] = parsed
+        return self
+    def _to_op_register_4bit(self: Params4bit, *args, **kwargs):
+        """
+        Replacement for `Params4bit.to`: record a CUDA move instead of doing it.
+        Non-CUDA targets are forwarded to the original `to`. For a CUDA target the
+        parsed arguments are stored in `to_ops_4bit` and `self` is returned as is.
+        """
+        parsed = torch._C._nn._parse_to(*args, **kwargs)
+        device, *_ = parsed
+        if not isinstance(device, torch.device): # pragma: no cover
+            return _param_to_4bit(self, *args, **kwargs)
+        if device.type != 'cuda':
+            return _param_to_4bit(self, *args, **kwargs)
+        to_ops_4bit[self] = parsed
+        return self
+    def _cuda_op_arg_check(device: Torch.device | int | str | None) -> bool:
+        """Return True when `device` is None, an int, or a device (or device string) of type 'cuda'."""
+        if device is None: # pragma: no cover
+            return True
+        if isinstance(device, int):
+            return True
+        if isinstance(device, str): # pragma: no cover
+            device = torch.device(device)
+        return device.type == 'cuda' # pragma: no cover
+    def _cuda_op_register_8bit(self: Int8Params, device: Torch.device | int | str | None = None, **kwargs):
+        """Replacement for `Int8Params.cuda`: register `self` in `to_ops_8bit` (no stored args) and return it."""
+        if not _cuda_op_arg_check(device): # pragma: no cover
+            # Let PyTorch handle the fail
+            return _param_cuda_8bit(self, device, **kwargs)
+        to_ops_8bit[self] = None
+        return self
+    def _cuda_op_register_4bit(self: Params4bit, device: Torch.device | int | str | None = None, **kwargs):
+        """Replacement for `Params4bit.cuda`: register `self` in `to_ops_4bit` (no stored args) and return it."""
+        if not _cuda_op_arg_check(device): # pragma: no cover
+            # Let PyTorch handle the fail
+            return _param_cuda_4bit(self, device, **kwargs)
+        to_ops_4bit[self] = None
+        return self
+    def _patch():
+        """Install the registering `to` / `cuda` replacements on `Int8Params` and `Params4bit`."""
+        Int8Params.to   = _to_op_register_8bit   # type: ignore
+        Int8Params.cuda = _cuda_op_register_8bit # type: ignore
+        Params4bit.to   = _to_op_register_4bit   # type: ignore
+        Params4bit.cuda = _cuda_op_register_4bit # type: ignore
+    def _unpatch():
+        """Restore the original `to` / `cuda` methods saved at import time."""
+        Int8Params.to   = _param_to_8bit   # type: ignore
+        Int8Params.cuda = _param_cuda_8bit
+        Params4bit.to   = _param_to_4bit   # type: ignore
+        Params4bit.cuda = _param_cuda_4bit
+    def _move():
+        """
+        Reload the bitsandbytes C extension, then move every registered
+        parameter to 'cuda' with the original `to`, reusing the dtype and
+        memory format recorded at registration (None when nothing was recorded).
+        """
+        # Reset the setup singleton (when present) before reloading `cextension`
+        # NOTE(review): presumably so the library is re-detected with CUDA visible - confirm
+        if CUDASetup is not None:
+            CUDASetup._instance = None
+        importlib.reload(cextension)
+        functional.lib = cextension.lib
+        for op in to_ops_8bit.items():
+            tensor, parsed_args = op
+            if parsed_args:
+                _, dtype, _, memory_format = parsed_args
+            else:
+                dtype, memory_format = None, None
+            tensor.data = _param_to_8bit(tensor,
+                device='cuda',
+                dtype=dtype,
+                memory_format=memory_format,
+            ) # type: ignore
+        for op in to_ops_4bit.items():
+            tensor, parsed_args = op
+            if parsed_args:
+                _, dtype, _, memory_format = parsed_args
+            else:
+                dtype, memory_format = None, None
+            tensor.data = _param_to_4bit(tensor,
+                device='cuda',
+                dtype=dtype,
+                memory_format=memory_format,
+            ) # type: ignore
+else:
+    _patch = lambda: None
+    _unpatch = lambda: None
+    _move = lambda: None
+patch = _patch
+unpatch = _unpatch
+move = _move

spaces/zero/client.py ADDED Viewed

	@@ -0,0 +1,176 @@

+"""
+"""
+from __future__ import annotations
+import os
+import time
+import warnings
+from datetime import timedelta
+import gradio as gr
+import httpx
+from .. import utils
+from ..config import Config
+from .api import APIClient
+from .api import QuotaInfos
+from .api import ScheduleResponse
+from .gradio import get_event
+TOKEN_HEADER = 'X-IP-Token'
+DEFAULT_SCHEDULE_DURATION = 60
+QUOTA_MESSAGE = "You have exceeded your GPU quota"
+UNUSED_MESSAGE = "GPU device not used"
+NO_GPU_MESSAGE_REGULAR = "No GPU is currently available"
+NO_GPU_MESSAGE_INQUEUE = "No GPU is currently available for you after 60s"
+def api_client():
+    """Build a new `APIClient` on an `httpx.Client` pointed at `Config.zero_device_api_url`."""
+    assert Config.zero_device_api_url is not None
+    # NOTE(review): `verify=False` turns off TLS certificate verification - confirm this is intended
+    httpx_client = httpx.Client(base_url=Config.zero_device_api_url, timeout=60, verify=False)
+    print(f"api_client: {Config.zero_device_api_url}")
+    return APIClient(httpx_client)
+def startup_report():
+    """
+    Send the startup report, retrying while the API answers 404.
+    Raises:
+        RuntimeError: when 404 persists past `max_retries` retries, or when the
+            final status is anything other than 200
+    """
+    retries, max_retries = 0, 2
+    client = api_client()
+    # One-second pause between attempts
+    while (status := client.startup_report()) is httpx.codes.NOT_FOUND: # pragma: no cover
+        time.sleep(1)
+        if (retries := retries + 1) > max_retries:
+            raise RuntimeError("Error while initializing ZeroGPU: NotFound")
+    if status is not httpx.codes.OK: # pragma: no cover
+        raise RuntimeError("Error while initializing ZeroGPU: Unknown")
+def schedule(
+    task_id: int,
+    request: gr.Request | None = None,
+    duration: timedelta | None = None,
+    _first_attempt: bool = True,
+) -> ScheduleResponse:
+    if not gr.__version__.startswith('4.'): # pragma: no cover
+        raise RuntimeError("ZeroGPU is only compatible with Gradio 4+")
+    res = api_client().schedule(
+        cgroup_path=utils.self_cgroup_device_path(),
+        task_id=task_id,
+        token=_get_token(request),
+        duration_seconds=duration.seconds if duration is not None else None,
+    )
+    if isinstance(res, ScheduleResponse):
+        return res
+    if isinstance(res, QuotaInfos): # pragma: no cover
+        requested = duration.seconds if duration is not None else DEFAULT_SCHEDULE_DURATION
+        if res.wait < timedelta(0):
+            message = (
+                f"The requested GPU duration ({requested}s) "
+                f"is larger than the maximum allowed"
+            )
+        else:
+            message = (
+                f"You have exceeded your GPU quota "
+                f"({res.left}s left vs. {requested}s requested). "
+                f"Please retry in {res.wait}"
+            )
+        raise gr.Error(message)
+    if not isinstance(res, httpx.codes): # pragma: no cover
+        gr.Info("Waiting for a GPU to become available")
+        connection_event = get_event()
+        if connection_event is None and request is not None:
+            warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
+        while True:
+            try:
+                event = next(res)
+            except StopIteration:
+                raise RuntimeError("Unexpected end of stream")
+            except httpx.RemoteProtocolError:
+                if not _first_attempt:
+                    raise RuntimeError("Error while re-trying after queue disconnect")
+                return schedule(task_id, request, duration, _first_attempt=False)
+            if event.event == 'ping':
+                if connection_event is not None and not connection_event.alive:
+                    res.close()
+                    raise RuntimeError("Connection closed by visitor while queueing")
+                continue
+            if event.event == 'failed':
+                raise gr.Error(NO_GPU_MESSAGE_INQUEUE)
+            if event.event == 'succeeded':
+                assert event.data is not None
+                if connection_event is not None and not connection_event.alive:
+                    release(task_id, event.data.nvidiaIndex)
+                    raise RuntimeError("Connection closed by visitor on queue success")
+                gr.Info("Successfully acquired a GPU")
+                return event.data
+    if res is httpx.codes.SERVICE_UNAVAILABLE:
+        raise gr.Error(NO_GPU_MESSAGE_REGULAR)
+    # TODO: Find a way to log 'detail' response field
+    raise RuntimeError(f"ZeroGPU API /schedule error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
+def allow(allow_token: str) -> None:
+    pid = os.getpid()
+    assert pid != 1, "Allowing PID 1 on ZeroGPU will end up killing your Space"
+    assert api_client().allow(allow_token=allow_token, pid=pid) is httpx.codes.OK
+def release(
+    task_id: int,
+    nvidia_index: int,
+    fail: bool = False,
+    allow_404: bool = False,
+) -> None:
+    """
+    Call the `/release` endpoint for the given task and device index.
+    A 204 answer emits the "GPU device not used" warning, a 404 answer emits a
+    warning unless `allow_404` is set, and any other success status returns silently.
+    Raises:
+        RuntimeError: for any non-success status other than 404
+    """
+    res = api_client().release(
+        cgroup_path=utils.self_cgroup_device_path(),
+        task_id=task_id,
+        nvidia_index=nvidia_index,
+        fail=fail,
+    )
+    if res is httpx.codes.NO_CONTENT: # pragma: no cover
+        # `gr.Warning` failing with AttributeError is tolerated
+        try:
+            gr.Warning(UNUSED_MESSAGE)
+        except AttributeError:
+            pass
+        warnings.warn(UNUSED_MESSAGE, RuntimeWarning)
+        return None
+    if res is httpx.codes.NOT_FOUND:
+        if not allow_404:
+            warnings.warn("ZeroGPU API /release warning: 404 Not Found")
+        return None
+    if httpx.codes.is_success(res):
+        return None
+    # TODO: Find a way to log 'detail' response field
+    # TODO: Only raise in dev environment. Simply warn in production ?
+    raise RuntimeError(f"ZeroGPU API /release error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
+def _get_token(request: gr.Request | None) -> str | None:
+    """
+    Read the `X-IP-Token` header (looked up in lowercase) from `request`.
+    Returns:
+        `str | None`: the token, or None when `request` is None
+    Raises:
+        gr.Error: if the headers are missing or unusable, or the token is absent or empty
+    """
+    if request is None:
+        return None
+    headers = getattr(request, 'headers', None)
+    if headers is None or not hasattr(headers, '__dict__'):
+        raise gr.Error("Internal Gradio error")
+    # Compatibility trick
+    if not hasattr(headers, 'get'):
+        headers = headers.__dict__ # pragma: no cover
+    if not (token := headers.get(TOKEN_HEADER.lower())):
+        raise gr.Error("Internal infra error")
+    return token

spaces/zero/decorator.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""
+"""
+from __future__ import annotations
+import inspect
+import sys
+import warnings
+from datetime import timedelta
+from functools import partial
+from typing import Callable
+from typing import TypeVar
+from typing import overload
+from typing_extensions import ParamSpec
+from typing_extensions import Unpack
+import gradio as gr
+from ..config import Config
+from . import client
+from .types import EmptyKwargs
+from .wrappers import regular_function_wrapper
+from .wrappers import generator_function_wrapper
+P = ParamSpec('P')
+R = TypeVar('R')
+decorated_cache: dict[Callable, Callable] = {}
+@overload
+def GPU(
+    task: None = None, *,
+    duration: int | timedelta | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    ...
+@overload
+def GPU(
+    task: Callable[P, R], *,
+    duration: int | timedelta | None = None,
+) -> Callable[P, R]:
+    ...
+def GPU(
+    task: Callable[P, R] | None = None, *,
+    duration: int | timedelta | None = None,
+    **kwargs: Unpack[EmptyKwargs],
+) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]:
+    """
+    ZeroGPU decorator
+    Basic usage:
+        ```
+        @spaces.GPU
+        def fn(...):
+            # CUDA is available here
+            pass
+        ```
+    With custom duration:
+        ```
+        @spaces.GPU(duration=45) # Expressed in seconds
+        def fn(...):
+            # CUDA is available here
+            pass
+        ```
+    Args:
+        task (`Callable | None`): Python function that requires CUDA
+        duration (`int | datetime.timedelta`): Estimated duration in seconds or `datetime.timedelta`
+    Returns:
+        `Callable`: GPU-ready function
+    """
+    # `enable_queue` is still accepted through **kwargs but only triggers a warning
+    if "enable_queue" in kwargs:
+        warnings.warn("`enable_queue` parameter is now ignored and always set to `True`")
+    # Called with arguments only (`@spaces.GPU(duration=...)`): return the actual decorator
+    if task is None:
+        return partial(_GPU, duration=duration)
+    return _GPU(task, duration)
+def _GPU(
+    task: Callable[P, R],
+    duration: int | timedelta | None,
+) -> Callable[P, R]:
+    if not Config.zero_gpu:
+        # TODO: still prepend gr.Request for type consistency ?
+        return task # type: ignore
+    if sys.version_info.minor < 9: # pragma: no cover
+        raise RuntimeError("Actually using @spaces.GPU on a ZeroGPU Space requires Python 3.9+")
+    if task in decorated_cache:
+        # TODO: Assert same duration ?
+        return decorated_cache[task] # type: ignore
+    if inspect.iscoroutinefunction(task):
+        raise NotImplementedError
+    if duration is None or isinstance(duration, timedelta):
+        timedelta_duration = duration
+    else:
+        timedelta_duration = timedelta(seconds=duration)
+    if inspect.isgeneratorfunction(task):
+        decorated = generator_function_wrapper(task, timedelta_duration)
+    else:
+        decorated = regular_function_wrapper(task, timedelta_duration)
+    client.startup_report()
+    decorated_cache.update({
+        task:      decorated,
+        decorated: decorated,
+    })
+    return decorated # type: ignore

spaces/zero/gradio.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+"""
+from __future__ import annotations
+from typing import NamedTuple
+import warnings
+from gradio.context import LocalContext
+from gradio.helpers import Progress
+from gradio.helpers import TrackedIterable
+from gradio.queueing import Queue
+from typing_extensions import assert_type
+from ..utils import SimpleQueue
+from .types import GeneratorResQueueResult
+from .types import GradioQueueEvent
+from .types import RegularResQueueResult
+QUEUE_RPC_METHODS = [
+    "set_progress",
+    "log_message",
+]
+class GradioPartialContext(NamedTuple):
+    """Snapshot of three Gradio `LocalContext` values: event id, listener flag and progress."""
+    event_id: str | None
+    in_event_listener: bool
+    progress: Progress | None
+    @staticmethod
+    def get():
+        """Capture the current `LocalContext` values into a new `GradioPartialContext`."""
+        # Side effect: (re)installs the custom `TrackedIterable.__reduce__` on every call
+        TrackedIterable.__reduce__ = tracked_iterable__reduce__
+        return GradioPartialContext(
+            event_id=LocalContext.event_id.get(),
+            in_event_listener=LocalContext.in_event_listener.get(),
+            progress=LocalContext.progress.get(),
+        )
+    @staticmethod
+    def apply(context: 'GradioPartialContext'):
+        """Write the values of `context` back into `LocalContext`."""
+        LocalContext.event_id.set(context.event_id)
+        LocalContext.in_event_listener.set(context.in_event_listener)
+        LocalContext.progress.set(context.progress)
+def get_queue_instance():
+    """Return `_queue` of the Blocks found in `LocalContext.blocks`, or None when no Blocks is set."""
+    blocks = LocalContext.blocks.get()
+    if blocks is None: # pragma: no cover
+        return None
+    return blocks._queue
+def get_event():
+    """
+    Find the active queue event whose `_id` matches `LocalContext.event_id`.
+    Returns None when there is no queue, no current event id, or no match
+    (the no-match case falls off the end of the function).
+    """
+    queue = get_queue_instance()
+    event_id = LocalContext.event_id.get()
+    if queue is None:
+        return None
+    if event_id is None: # pragma: no cover
+        return None
+    # Each non-None entry of `active_jobs` is iterated as a collection of events
+    for job in queue.active_jobs:
+        if job is None: # pragma: no cover
+            continue
+        for event in job:
+            if event._id == event_id:
+                return event
+def try_process_queue_event(method_name: str, *args, **kwargs):
+    """Call `method_name` on the Gradio queue instance with the given arguments; warn and return when no queue is available."""
+    queue = get_queue_instance()
+    if queue is None: # pragma: no cover
+        warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
+        return
+    method = getattr(queue, method_name, None)
+    assert callable(method)
+    method(*args, **kwargs)
+def patch_gradio_queue(
+    res_queue: SimpleQueue[RegularResQueueResult | None] | SimpleQueue[GeneratorResQueueResult | None],
+):
+    """
+    Replace the `Queue` methods listed in `QUEUE_RPC_METHODS` with stubs that
+    put a `GradioQueueEvent(method_name, args, kwargs)` on `res_queue` instead
+    of running the method. Also installs the custom `TrackedIterable.__reduce__`.
+    """
+    def rpc_method(method_name: str):
+        def method(*args, **kwargs):
+            if args and isinstance(args[0], Queue):
+                args = args[1:] # drop `self`
+            res_queue.put(GradioQueueEvent(method_name, args, kwargs))
+        return method
+    for method_name in QUEUE_RPC_METHODS:
+        # Missing or non-callable attributes are skipped with a warning
+        if (method := getattr(Queue, method_name, None)) is None: # pragma: no cover
+            warnings.warn(f"ZeroGPU: Gradio Queue has no {method_name} attribute")
+            continue
+        if not callable(method): # pragma: no cover
+            warnings.warn(f"ZeroGPU: Gradio Queue {method_name} is not callable")
+            continue
+        setattr(Queue, method_name, rpc_method(method_name))
+    TrackedIterable.__reduce__ = tracked_iterable__reduce__
+def tracked_iterable__reduce__(self):
+    """
+    `__reduce__` replacement for `TrackedIterable`: same result as the parent
+    implementation, with the `iterable` and `_tqdm` state entries set to None.
+    NOTE(review): presumably because those two values cannot be pickled - confirm.
+    """
+    res: tuple = super(TrackedIterable, self).__reduce__() # type: ignore
+    # Only the first three items of the reduce tuple are kept
+    cls, base, state, *_ = res
+    return cls, base,{**state, **{
+        'iterable': None,
+        '_tqdm': None,
+    }}

spaces/zero/torch.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""
+"""
+# pyright: reportPrivateImportUsage=false
+from __future__ import annotations
+import multiprocessing
+import os
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import suppress
+from functools import partial
+from types import SimpleNamespace
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Optional
+from typing import Tuple
+from ..config import Config
+from . import bitsandbytes
+from .utils import maybe_import_torch
+if TYPE_CHECKING:
+    import torch as Torch
+# Nvidia A100.80G MIG (drivers 535) / Torch 2.2.0
+CUDA_DEVICE_NAME = 'NVIDIA A100-SXM4-80GB MIG 3g.40gb'
+CUDA_TOTAL_MEMORY = 42144366592
+CUDA_MEM_GET_INFO = (41911451648, CUDA_TOTAL_MEMORY)
+CUDA_DEVICE_CAPABILITY = (8, 0)
+CUDA_DEVICE_PROPERTIES = SimpleNamespace(name=CUDA_DEVICE_NAME, major=8, minor=0, total_memory=CUDA_TOTAL_MEMORY, multi_processor_count=42)
+GENERIC_METHOD_NAMES = [
+    'arange',
+    'as_tensor',
+    'asarray',
+    'bartlett_window',
+    'blackman_window',
+    'empty',
+    'empty_like',
+    'empty_strided',
+    'eye',
+    'full',
+    'full_like',
+    'hamming_window',
+    'hann_window',
+    'kaiser_window',
+    'linspace',
+    'logspace',
+    'obj',
+    'ones',
+    'ones_like',
+    'rand',
+    'rand_like',
+    'randint',
+    'randint_like',
+    'randn',
+    'randn_like',
+    'randperm',
+    'range',
+    'sparse_bsc_tensor',
+    'sparse_bsr_tensor',
+    'sparse_compressed_tensor',
+    'sparse_coo_tensor',
+    'sparse_csc_tensor',
+    'sparse_csr_tensor',
+    'tensor',
+    'tril_indices',
+    'triu_indices',
+    'zeros',
+    'zeros_like',
+]
+if (torch := maybe_import_torch()):
+    from torch.utils.weak import WeakTensorKeyDictionary
+    TO_CUDA = (torch.device('cuda'), None, False, None)
+    _tensor__deepcopy__ = torch.Tensor.__deepcopy__
+    _tensor_to         = torch.Tensor.to
+    _tensor_cuda       = torch.Tensor.cuda
+    _tensor_cpu        = torch.Tensor.cpu
+    _torch_generics    = {name: getattr(torch, name) for name in GENERIC_METHOD_NAMES}
+    _cuda_init         = torch._C._cuda_init
+    _cuda_available      = torch.cuda.is_available
+    _cuda_device_count   = torch.cuda.device_count
+    _cuda_current_device = torch.cuda.current_device
+    _cuda_mem_get_info   = torch.cuda.mem_get_info
+    _cuda_get_device_capability   = torch.cuda.get_device_capability
+    _cuda_get_device_properties   = torch.cuda.get_device_properties
+    _cuda_get_device_name         = torch.cuda.get_device_name
+    TensorToArgs = Tuple[Optional[torch.device], Optional[torch.dtype], bool, Optional[torch.memory_format]]
+    to_ops: dict[Torch.Tensor, TensorToArgs] = WeakTensorKeyDictionary() # type: ignore
+    def _tensor_new_register(*args, **kwargs):
+        """Create a tensor via `_TensorBase.__new__`; if its `_base` tensor is registered in `to_ops`, register the new tensor with the same entry."""
+        new_tensor: Torch.Tensor = torch._C._TensorBase.__new__(*args, **kwargs)
+        if (base_tensor := new_tensor._base) is not None:
+            if base_tensor in to_ops:
+                to_ops[new_tensor] = to_ops[base_tensor]
+        return new_tensor
+    def _tensor_deepcopy_register(self: Torch.Tensor, memo):
+        """Deep-copy with the original `__deepcopy__`; a copy of a registered tensor is registered in `to_ops` with the same entry."""
+        new_tensor = _tensor__deepcopy__(self, memo)
+        if isinstance(new_tensor, torch.Tensor):
+            if self in to_ops:
+                to_ops[new_tensor] = to_ops[self]
+        return new_tensor
+    @property
+    def _tensor_device_property(self: Torch.Tensor):
+        """Report `cuda:0` for tensors registered in `to_ops`; otherwise return the real `device`."""
+        if self in to_ops:
+            return torch.device(type='cuda', index=0)
+        # Remove the `device` attribute set on `torch.Tensor`, read `self.device`
+        # without it, then put this property back in place.
+        del torch.Tensor.device
+        try:
+            return self.device
+        finally:
+            torch.Tensor.device = _tensor_device_property # type: ignore
+    @property
+    def _tensor_dtype_property(self: Torch.Tensor):
+        """Report the dtype recorded in `to_ops` for a registered tensor when one was recorded; otherwise return the real `dtype`."""
+        if self in to_ops:
+            if (to_dtype := to_ops[self][1]) is not None:
+                return to_dtype
+        # Remove the `dtype` attribute set on `torch.Tensor`, read `self.dtype`
+        # without it, then put this property back in place.
+        del torch.Tensor.dtype
+        try:
+            return self.dtype
+        finally:
+            torch.Tensor.dtype = _tensor_dtype_property # type: ignore
+    def _to_op_register(self: Torch.Tensor, *args, **kwargs):
+        """Replacement for ``Tensor.to`` that records CUDA moves instead of running them.
+
+        Returns ``self`` (unchanged) when the move is only recorded in ``to_ops``,
+        otherwise returns whatever the original ``Tensor.to`` returns.
+        """
+        parsed = torch._C._nn._parse_to(*args, **kwargs)
+        device, dtype, *_ = parsed
+        # Any previous record for this tensor is consumed; it is re-added below when still relevant.
+        try:
+            to_args = to_ops.pop(self)
+        except KeyError:
+            to_args = None
+        if device is None:
+            # No target device given: keep a pending move but swap in the new dtype.
+            # NOTE(review): if this call carries no dtype, the previously recorded
+            # dtype is replaced by None — confirm that this is intended.
+            if to_args is not None:
+                to_ops[self] = (to_args[0], dtype, *to_args[2:])
+                return self
+            return _tensor_to(self, *args, **kwargs)
+        if device.type != 'cuda':
+            # Real move to a non-CUDA device: apply the recorded dtype unless the caller passed `dtype=` by keyword.
+            if to_args is not None:
+                if (to_dtype := to_args[1]) is not None:
+                    kwargs = {'dtype': to_dtype, **kwargs}
+            return _tensor_to(self, *args, **kwargs)
+        # Target is CUDA: defer the move and remember the parsed arguments.
+        to_ops[self] = parsed
+        return self
+    def _cuda_op_arg_check(device: Torch.device | int | str | None) -> bool:
+        if device is None:
+            return True
+        if isinstance(device, int):
+            return True
+        if isinstance(device, str):
+            device = torch.device(device)
+        return device.type == 'cuda'
+    def _cuda_op_register(self: Torch.Tensor, device: Torch.device | int | str | None = None, **kwargs):
+        if not _cuda_op_arg_check(device):
+            # Let PyTorch handle the fail
+            return _tensor_cuda(self, device, **kwargs)
+        to_ops[self] = TO_CUDA
+        return self
+    def _cpu_op_remove(self: Torch.Tensor, **kwargs):
+        try:
+            to_args = to_ops.pop(self)
+        except KeyError:
+            to_args = None
+        if to_args is not None:
+            if (to_dtype := to_args[1]) is not None:
+                return _tensor_to(self, 'cpu', **{'dtype': to_dtype, **kwargs})
+        return _tensor_cpu(self, **kwargs)
+    def _cuda_init_raise():
+        raise RuntimeError(
+            "CUDA must not be initialized in the main process "
+            "on Spaces with Stateless GPU environment.\n"
+            "You can look at this Stacktrace to find out "
+            "which part of your code triggered a CUDA init"
+        )
+    def _generic_method_register(name: str, *args: Any, **kwargs: Any):
+        try:
+            device = torch.device(kwargs.get('device', "cpu"))
+        except Exception:
+            return _torch_generics[name](*args, **kwargs)
+        if device.type != 'cuda':
+            return _torch_generics[name](*args, **kwargs)
+        tensor = _torch_generics[name](*args, **{**kwargs, 'device': "cpu"})
+        to_ops[tensor] = TO_CUDA
+        return tensor
+    def _patch():
+        """Install every torch override defined in this module (undone by ``_unpatch``)."""
+        # Tensor creation / copy hooks that propagate deferred moves.
+        torch.Tensor.__deepcopy__ = _tensor_deepcopy_register
+        torch.Tensor.__new__      = _tensor_new_register # pyright: ignore [reportAttributeAccessIssue]
+        # Device-transfer methods that record moves instead of performing them.
+        torch.Tensor.to         = _to_op_register   # type: ignore
+        torch.Tensor.cuda       = _cuda_op_register # type: ignore
+        torch.Tensor.cpu        = _cpu_op_remove # type: ignore
+        # Optional: make `.device` / `.dtype` reflect the deferred move.
+        if Config.zero_patch_torch_device:
+            torch.Tensor.device = _tensor_device_property # type: ignore
+            torch.Tensor.dtype  = _tensor_dtype_property # pyright: ignore [reportAttributeAccessIssue]
+        # Module-level functions listed in GENERIC_METHOD_NAMES get the `device=` interception.
+        for name in GENERIC_METHOD_NAMES:
+            setattr(torch, name, partial(_generic_method_register, name))
+        # Any real CUDA initialization now raises.
+        torch._C._cuda_init     = _cuda_init_raise
+        # CUDA introspection is answered with fixed values.
+        torch.cuda.is_available   = lambda: True
+        torch.cuda.device_count   = lambda: 1
+        torch.cuda.current_device = lambda: 0
+        torch.cuda.mem_get_info   = lambda *args, **kwargs: CUDA_MEM_GET_INFO
+        torch.cuda.get_device_capability = lambda *args, **kwargs: CUDA_DEVICE_CAPABILITY
+        torch.cuda.get_device_properties = lambda *args, **kwargs: CUDA_DEVICE_PROPERTIES
+        torch.cuda.get_device_name       = lambda *args, **kwargs: CUDA_DEVICE_NAME
+        bitsandbytes.patch()
+    def _unpatch():
+        """Restore every torch attribute replaced by ``_patch`` to its saved original."""
+        torch.Tensor.__deepcopy__ = _tensor__deepcopy__
+        # The overrides below were set on the class itself; deleting them exposes
+        # the underlying attribute again. AttributeError (nothing to delete) is ignored.
+        with suppress(AttributeError):
+            del torch.Tensor.__new__
+        torch.Tensor.to         = _tensor_to
+        torch.Tensor.cuda       = _tensor_cuda
+        torch.Tensor.cpu        = _tensor_cpu
+        with suppress(AttributeError):
+            del torch.Tensor.device
+        with suppress(AttributeError):
+            del torch.Tensor.dtype
+        for name in GENERIC_METHOD_NAMES:
+            setattr(torch, name, _torch_generics[name])
+        torch._C._cuda_init     = _cuda_init
+        torch.cuda.is_available   = _cuda_available
+        torch.cuda.device_count   = _cuda_device_count
+        torch.cuda.current_device = _cuda_current_device
+        torch.cuda.mem_get_info   = _cuda_mem_get_info
+        torch.cuda.get_device_capability = _cuda_get_device_capability
+        torch.cuda.get_device_properties = _cuda_get_device_properties
+        torch.cuda.get_device_name       = _cuda_get_device_name
+        bitsandbytes.unpatch()
+    def _move(nvidia_uuid: str):
+        os.environ['CUDA_VISIBLE_DEVICES'] = nvidia_uuid
+        torch.Tensor([0]).cuda() # CUDA init
+        for op in to_ops.items():
+            tensor, parsed_args = op
+            _, dtype, _, memory_format = parsed_args
+            tensor.data = _tensor_to(tensor,
+                device='cuda',
+                dtype=dtype,
+                memory_format=memory_format,
+            ) # type: ignore
+        bitsandbytes.move()
+        torch.cuda.synchronize()
+    def _is_in_bad_fork():
+        with ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as e:
+            f = e.submit(torch.cuda._is_in_bad_fork)
+            return f.result()
+    def _disable_cuda_intercept():
+        torch.Tensor.to   = _tensor_to
+        torch.Tensor.cuda = _tensor_cuda
+else:
+    # Fallback branch: every entry point becomes a no-op with the same call signature.
+    _patch = lambda: None
+    _unpatch = lambda: None
+    _move = lambda nvidia_uuid: None
+    _is_in_bad_fork = lambda: False
+    _disable_cuda_intercept = lambda: None
+# Public names of this module, bound to whichever implementation was selected above.
+patch = _patch
+unpatch = _unpatch
+move = _move
+is_in_bad_fork = _is_in_bad_fork
+disable_cuda_intercept = _disable_cuda_intercept

spaces/zero/tqdm.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""
+"""
+from multiprocessing.synchronize import RLock as MultiprocessingRLock
+def remove_tqdm_multiprocessing_lock():
+    from tqdm import tqdm
+    tqdm_lock = tqdm.get_lock()
+    assert tqdm_lock.__class__.__name__ == 'TqdmDefaultWriteLock'
+    tqdm_lock.locks = [
+        lock for lock in tqdm_lock.locks
+        if not isinstance(lock, MultiprocessingRLock)
+    ]

spaces/zero/types.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+from typing import Dict
+from typing import Tuple
+from typing import TypedDict
+from typing_extensions import Generic
+from typing_extensions import ParamSpec
+from typing_extensions import TypeAlias
+from typing_extensions import TypeVar
+# A call's arguments packed as (positional args, keyword args).
+Params = Tuple[Tuple[object, ...], Dict[str, Any]]
+# Result type of a wrapped task.
+Res = TypeVar('Res')
+# Parameter specification of a wrapped task.
+Param = ParamSpec('Param')
+class EmptyKwargs(TypedDict):
+    """TypedDict that declares no keys."""
+    pass
+@dataclass
+class OkResult(Generic[Res]):
+    """Queue message carrying a value produced by a task."""
+    value: Res
+@dataclass
+class ExceptionResult:
+    """Queue message carrying an exception to be re-raised by the receiver."""
+    value: Exception
+@dataclass
+class AbortedResult:
+    """Marker message: the task was aborted before producing a final result."""
+    pass
+@dataclass
+class EndResult:
+    """Marker message: a generator task has no more values to send."""
+    pass
+@dataclass
+class GradioQueueEvent:
+    """Queue message describing a method call (name, args, kwargs) to replay on the receiving side."""
+    method_name: str
+    args: tuple[Any, ...]
+    kwargs: dict[str, Any]
+# Message unions per queue, written as strings so they are not evaluated at import time.
+# Result queue of a regular (non-generator) task.
+RegularResQueueResult:   TypeAlias = "OkResult[Res] | ExceptionResult | GradioQueueEvent"
+# Result queue of a generator task (adds the end-of-stream marker).
+GeneratorResQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | GradioQueueEvent"
+# In-process queue feeding the generator handler (adds the aborted marker, no Gradio events).
+YieldQueueResult:        TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | AbortedResult"

spaces/zero/utils.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+"""
+from __future__ import annotations
+from contextlib import contextmanager
+from importlib import metadata
+from types import ModuleType
+from packaging import version
+from ..config import Config
+def maybe_import_torch():
+    if not Config.zero_gpu:
+        return None
+    try:
+        import torch
+    except ImportError:
+        return None
+    return torch
+@contextmanager
+def cuda_unavailable(torch: ModuleType):
+    _is_available = torch.cuda.is_available
+    torch.cuda.is_available = lambda: False
+    yield
+    torch.cuda.is_available = _is_available
+def maybe_import_bitsandbytes():
+    if (torch := maybe_import_torch()) is None:
+        return None # pragma: no cover
+    with cuda_unavailable(torch):
+        try:
+            import bitsandbytes
+        except ImportError:
+            bitsandbytes = None
+        else:
+            if (bnb_version := version.parse(metadata.version('bitsandbytes'))) < version.parse('0.40.0'):
+                raise RuntimeError(f"ZeroGPU requires bitsandbytes >= 0.40.0 (installed: {bnb_version})") # pragma: no cover
+            print("↑ Those bitsandbytes warnings are expected on ZeroGPU ↑")
+    return bitsandbytes

spaces/zero/wrappers.py ADDED Viewed

	@@ -0,0 +1,347 @@

+"""
+"""
+from __future__ import annotations
+import multiprocessing
+import os
+import signal
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from contextvars import copy_context
+from datetime import timedelta
+from functools import partial
+from functools import wraps
+from multiprocessing.context import ForkProcess
+from pickle import PicklingError
+from queue import Empty
+from queue import Queue as ThreadQueue
+from threading import Thread
+from typing import TYPE_CHECKING
+from typing import Callable
+from typing import Generator
+from typing import Generic
+from typing_extensions import assert_never
+import gradio as gr
+import psutil
+from ..utils import debug
+from ..utils import drop_params
+from ..utils import gradio_request_var
+from ..utils import SimpleQueue as Queue
+from . import client
+from . import torch
+from .api import AllowToken
+from .api import NvidiaIndex
+from .api import NvidiaUUID
+from .gradio import GradioPartialContext
+from .gradio import patch_gradio_queue
+from .gradio import try_process_queue_event
+from .tqdm import remove_tqdm_multiprocessing_lock
+from .types import * # TODO: Please don't do that
+# Maximum wait (in seconds, i.e. 20 minutes) for the next item of a generator task.
+GENERATOR_GLOBAL_TIMEOUT = 20 * 60
+# Worker processes are always created with the 'fork' start method.
+Process = multiprocessing.get_context('fork').Process
+# Set to True inside worker processes so that wrapped tasks run directly instead of being re-dispatched.
+forked = False
+class Worker(Generic[Res]):
+    """A forked child process plus the two queues used to talk to it.
+
+    ``arg_queue`` carries call arguments to the child and ``res_queue`` carries
+    results back. A ``None`` on ``res_queue`` means the child process has exited.
+    """
+    process: ForkProcess
+    arg_queue: Queue[tuple[Params, GradioPartialContext]]
+    res_queue: Queue[Res | None]
+    _sentinel: Thread
+    def __init__(
+        self,
+        target: Callable[[
+            Queue[tuple[Params, GradioPartialContext]],
+            Queue[Res | None],
+            AllowToken | None,
+            NvidiaUUID,
+            list[int],
+        ], None],
+        allow_token: str | None,
+        nvidia_uuid: str,
+    ):
+        """Fork a daemon process running ``target`` and start the exit watcher thread."""
+        self._sentinel = Thread(target=self._close_on_exit)
+        self.arg_queue = Queue()
+        self.res_queue = Queue()
+        # File descriptors of this process's open connections, passed to the child (worker_init closes them there).
+        fds = [c.fd for c in psutil.Process().connections()]
+        args = self.arg_queue, self.res_queue, allow_token, nvidia_uuid, fds
+        # Never executed at runtime: lets the type checker validate `args` against `target`.
+        if TYPE_CHECKING:
+            target(*args)
+        self.process = Process(
+            target=target,
+            args=args,
+            daemon=True,
+        )
+        self.process.start()
+        self._sentinel.start()
+    def _close_on_exit(self):
+        """Block until the child exits, then unblock any reader with a ``None`` result."""
+        self.process.join()
+        self.res_queue.put(None)
+def worker_init(
+    res_queue: Queue[RegularResQueueResult | None] | Queue[GeneratorResQueueResult | None],
+    allow_token: str | None,
+    nvidia_uuid: str,
+    fds: list[int],
+) -> None | ExceptionResult:
+    try: # Unrecoverable init part
+        if allow_token is not None:
+            client.allow(allow_token)
+        torch.unpatch()
+        torch.move(nvidia_uuid)
+        patch_gradio_queue(res_queue)
+    except Exception as e: # pragma: no cover
+        traceback.print_exc()
+        return ExceptionResult(e)
+    try:
+        remove_tqdm_multiprocessing_lock()
+    except Exception: # pragma: no cover
+        print("Error while trying to remove tqdm mp_lock:")
+        traceback.print_exc()
+    for fd in fds:
+        try:
+            os.close(fd)
+        except Exception as e: # pragma: no cover
+            if isinstance(e, OSError) and e.errno == 9:
+                continue
+            traceback.print_exc()
+            return ExceptionResult(e)
+def regular_function_wrapper(
+    task: Callable[Param, Res],
+    duration: timedelta | None,
+) -> Callable[Param, Res]:
+    """Wrap a regular (non-generator) ``task`` so each call runs in a forked GPU worker.
+
+    Args:
+        task: the function to run in the worker process.
+        duration: forwarded to ``client.schedule``.
+
+    Returns:
+        A handler with the signature of ``task`` that schedules a GPU, sends the
+        arguments to a worker process and returns (or re-raises) its result.
+    """
+    request_var = gradio_request_var()
+    # One worker per GPU index, reused across calls while its process is alive.
+    workers: dict[NvidiaIndex, Worker[RegularResQueueResult[Res]]] = {}
+    task_id = id(task)
+    @wraps(task)
+    def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Res:
+        # Already inside a worker process: run the task directly.
+        if forked:
+            return task(*args, **kwargs)
+        request = request_var.get()
+        schedule_response = client.schedule(task_id=task_id, request=request, duration=duration)
+        allow_token = schedule_response.allowToken
+        nvidia_index = schedule_response.nvidiaIndex
+        nvidia_uuid = schedule_response.nvidiaUUID
+        release = partial(client.release, task_id=task_id, nvidia_index=nvidia_index)
+        worker = workers.get(nvidia_index)
+        if worker is None or not worker.process.is_alive():
+            worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
+            workers[nvidia_index] = worker
+        try:
+            worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
+        except PicklingError:
+            release(fail=True)
+            # TODO: Better error message (check what arg / kwarg is problematic ?)
+            raise
+        # Consume worker messages until a final one (result, exception or process exit) arrives.
+        while True:
+            res = worker.res_queue.get()
+            # None is posted by the Worker sentinel thread once the process has exited.
+            if res is None:
+                release(fail=True, allow_404=True)
+                raise gr.Error("GPU task aborted")
+            if isinstance(res, ExceptionResult):
+                release(fail=True)
+                raise res.value
+            if isinstance(res, OkResult):
+                release()
+                return res.value
+            if isinstance(res, GradioQueueEvent):
+                try_process_queue_event(res.method_name, *res.args, **res.kwargs)
+                continue
+            assert_never(res)
+    def thread_wrapper(
+        arg_queue: Queue[tuple[Params, GradioPartialContext]],
+        res_queue: Queue[RegularResQueueResult[Res] | None],
+        allow_token: str | None,
+        nvidia_uuid: str,
+        fds: list[int],
+    ):
+        # Entry point of the forked worker process: serve calls from arg_queue until it is closed.
+        global forked
+        forked = True
+        if (res := worker_init(
+            res_queue=res_queue,
+            allow_token=allow_token,
+            nvidia_uuid=nvidia_uuid,
+            fds=fds,
+        )) is not None: # pragma: no cover
+            res_queue.put(res)
+            return
+        # SIGTERM closes the argument queue; the blocked get() below then raises OSError and the loop ends.
+        signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
+        while True:
+            try:
+                (args, kwargs), gradio_context = arg_queue.get()
+            except OSError:
+                break
+            GradioPartialContext.apply(gradio_context)
+            context = copy_context()
+            # Leaving the `with` block waits for the task to finish; the future is read afterwards.
+            with ThreadPoolExecutor() as executor:
+                future = executor.submit(context.run, task, *args, **kwargs) # type: ignore
+            try:
+                res = future.result()
+            except Exception as e:
+                traceback.print_exc()
+                res = ExceptionResult(e)
+            else:
+                res = OkResult(res)
+            try:
+                res_queue.put(res)
+            except PicklingError as e:
+                # The result could not be pickled: report that failure instead.
+                res_queue.put(ExceptionResult(e))
+    # https://github.com/python/cpython/issues/91002
+    if not hasattr(task, '__annotations__'):
+        gradio_handler.__annotations__ = {}
+    return gradio_handler
+def generator_function_wrapper(
+    task: Callable[Param, Generator[Res, None, None]],
+    duration: timedelta | None,
+) -> Callable[Param, Generator[Res, None, None]]:
+    """Wrap a generator ``task`` so each call is iterated in a forked GPU worker.
+
+    Args:
+        task: the generator function to run in the worker process.
+        duration: forwarded to ``client.schedule``.
+
+    Returns:
+        A generator handler with the signature of ``task`` that yields the
+        values streamed back by the worker process.
+    """
+    request_var = gradio_request_var()
+    # One worker per GPU index, reused across calls while its process is alive.
+    workers: dict[NvidiaIndex, Worker[GeneratorResQueueResult[Res]]] = {}
+    task_id = id(task)
+    @wraps(task)
+    def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Generator[Res, None, None]:
+        # Already inside a worker process: iterate the task directly.
+        if forked:
+            yield from task(*args, **kwargs)
+            return
+        request = request_var.get()
+        schedule_response = client.schedule(task_id=task_id, request=request, duration=duration)
+        allow_token = schedule_response.allowToken
+        nvidia_index = schedule_response.nvidiaIndex
+        nvidia_uuid = schedule_response.nvidiaUUID
+        release = partial(client.release, task_id=task_id, nvidia_index=nvidia_index)
+        worker = workers.get(nvidia_index)
+        if worker is None or not worker.process.is_alive():
+            worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
+            workers[nvidia_index] = worker
+        try:
+            worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
+        except PicklingError:
+            release(fail=True)
+            raise
+        # In-process queue between the reader thread below and this generator.
+        yield_queue: ThreadQueue[YieldQueueResult[Res]] = ThreadQueue()
+        def fill_yield_queue(worker: Worker[GeneratorResQueueResult[Res]]):
+            # Runs in a background thread: forwards worker messages to yield_queue
+            # and releases the GPU as soon as a final message is seen.
+            while True:
+                res = worker.res_queue.get()
+                # None is posted by the Worker sentinel thread once the process has exited.
+                if res is None:
+                    release(fail=True, allow_404=True)
+                    yield_queue.put(AbortedResult())
+                    return
+                if isinstance(res, ExceptionResult):
+                    release(fail=True)
+                    yield_queue.put(ExceptionResult(res.value))
+                    return
+                if isinstance(res, EndResult):
+                    release()
+                    yield_queue.put(EndResult())
+                    return
+                if isinstance(res, OkResult):
+                    yield_queue.put(OkResult(res.value))
+                    continue
+                if isinstance(res, GradioQueueEvent): # pragma: no cover (not working properly on Gradio side)
+                    try_process_queue_event(res.method_name, *res.args, **res.kwargs)
+                    continue
+                debug(f"fill_yield_queue: assert_never({res=})")
+                assert_never(res)
+        # NOTE(review): duplicates the module-level import of assert_never; it also makes the
+        # name local to gradio_handler, so fill_yield_queue reads it through a closure.
+        from typing_extensions import assert_never
+        with ThreadPoolExecutor() as e:
+            f = e.submit(fill_yield_queue, worker)
+            f.add_done_callback(lambda _: debug("fill_yield_queue DONE"))
+            while True:
+                try:
+                    res = yield_queue.get(timeout=GENERATOR_GLOBAL_TIMEOUT)
+                except Empty: # pragma: no cover
+                    debug(f"yield_queue TIMEOUT ({GENERATOR_GLOBAL_TIMEOUT=})")
+                    raise
+                if isinstance(res, AbortedResult):
+                    raise gr.Error("GPU task aborted")
+                if isinstance(res, ExceptionResult):
+                    raise res.value
+                if isinstance(res, EndResult):
+                    break
+                if isinstance(res, OkResult):
+                    yield res.value
+                    continue
+                debug(f"gradio_handler: assert_never({res=})")
+                assert_never(res)
+    def thread_wrapper(
+        arg_queue: Queue[tuple[Params, GradioPartialContext]],
+        res_queue: Queue[GeneratorResQueueResult[Res] | None],
+        allow_token: str | None,
+        nvidia_uuid: str,
+        fds: list[int],
+    ):
+        # Entry point of the forked worker process: serve calls from arg_queue until it is closed.
+        global forked
+        forked = True
+        if (res := worker_init(
+            res_queue=res_queue,
+            allow_token=allow_token,
+            nvidia_uuid=nvidia_uuid,
+            fds=fds,
+        )) is not None: # pragma: no cover
+            res_queue.put(res)
+            return
+        # SIGTERM closes the argument queue; the blocked get() below then raises OSError and the loop ends.
+        signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
+        while True:
+            try:
+                (args, kwargs), gradio_context = arg_queue.get()
+            except OSError:
+                break
+            def iterate():
+                # Drive the task generator, sending each item (or the first failure) to res_queue.
+                gen = task(*args, **kwargs) # type: ignore
+                while True:
+                    try:
+                        res = next(gen)
+                    except StopIteration:
+                        break
+                    except Exception as e:
+                        res_queue.put(ExceptionResult(e))
+                        break
+                    try:
+                        res_queue.put(OkResult(res))
+                    except PicklingError as e:
+                        res_queue.put(ExceptionResult(e))
+                        break
+                    else:
+                        continue
+            GradioPartialContext.apply(gradio_context)
+            context = copy_context()
+            # Leaving the `with` block waits for iterate() to finish.
+            with ThreadPoolExecutor() as executor:
+                executor.submit(context.run, iterate)
+            # NOTE(review): EndResult is also queued after an ExceptionResult, while the reader
+            # stops at the ExceptionResult — if this worker is reused, the next call would
+            # presumably read this stale EndResult first; confirm.
+            res_queue.put(EndResult())
+    # https://github.com/python/cpython/issues/91002
+    if not hasattr(task, '__annotations__'):
+        gradio_handler.__annotations__ = {}
+    return gradio_handler