# graph_v18.py - Optimized for 3060 TI (8GB VRAM) and similar low-VRAM GPUs
# Copyright (C) 2025 Arcee AI
# SPDX-License-Identifier: LGPL-3.0-only
"""
Module for computational graph execution.
Classes:
Task: Abstract base class representing a computational task.
TaskUniverse: Registry that interns Task instances and their dependency edges.
TaskHandle: Lightweight index-based reference to a task within a TaskUniverse.
ExecutionSchedule: Topologically sorted task list plus last-use bookkeeping.
Executor: Class for scheduling and executing directed acyclic task graphs.
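
Example (hypothetical usage sketch; assumes my_tasks is a list of concrete
Task subclasses defined elsewhere):

    executor = Executor(my_tasks, math_device="cuda", storage_device="cpu")
    for task, value in executor.run():
        ...  # stream results as they become available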
"""
import os
import sys
import gc
import logging
import networkx
import torch
import tqdm
from pydantic import BaseModel
from typing_extensions import Generic, TypeVar
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
from mergekit.common import get_torch_accelerator_module
# ============================================================================
# CONFIGURATION SECTION - TUNE THESE PARAMETERS FOR YOUR GPU
# ============================================================================
# --- PRIMARY VRAM TARGETS ---
# For 3060 TI (8GB): Start with 7.2-7.4GB. Increase if stable, decrease if OOM.
# For 3060 (12GB): Try 10.5-11.0GB
# For 4GB cards: Try 3.2-3.5GB
TARGET_VRAM_GB = 7.7 # Target VRAM usage in GB (TUNE THIS FIRST)
# Safety margin to account for PyTorch overhead and fragmentation.
# Windows typically needs ~0.8GB, Linux ~0.5GB; 0.2 is deliberately aggressive.
VRAM_SAFETY_MARGIN_GB = 0.2 # Raise toward 0.5 (Linux) or 0.8-1.0 (Windows) if unstable
# --- CUDA MEMORY ALLOCATOR CONFIGURATION ---
# Smaller values = less fragmentation but more overhead
# 24MB is optimal for 8GB cards, 32MB for 12GB+ cards
CUDA_MAX_SPLIT_SIZE_MB = 24 # Options: 16, 24, 32, 64
# --- CHUNK SIZE BEHAVIOR ---
# How aggressively to reduce chunk size on OOM (0.5-0.9 range)
# Lower = more conservative (slower but safer), Higher = more aggressive
CHUNK_REDUCTION_FACTOR = 0.75 # Options: 0.5 (safe), 0.7 (balanced), 0.85 (aggressive)
# Minimum chunk size before giving up and falling back to CPU
MIN_CHUNK_SIZE = 1 # Usually keep at 1, increase to 4-8 if seeing micro-chunk overhead
# Enable power-of-2 alignment for chunk sizes (following measure.py strategy)
# This improves memory allocation efficiency
ENABLE_POWER_OF_2_ALIGNMENT = True # Set False if causing issues
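# Illustrative example of the alignment: a computed chunk of ~1500 rows is
# rounded down to 1024 when this flag is enabled (see _round_to_power_of_2).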
# --- TASK-SPECIFIC MEMORY MULTIPLIERS ---
# These control how much extra VRAM to reserve for specific task types
# Increase if task OOMs, decrease if underutilizing VRAM
TASK_MULTIPLIERS = {
"ModelStock": 2.2, # Options: 1.8-2.5 (needs room for pairwise similarities)
"Karcher": 3.0, # Options: 2.5-3.5 (iterative, needs working memory)
"Consensus": 3.0, # Options: 2.5-3.5 (similar to Karcher)
"default": 1.2, # Options: 1.0-1.5 (general tasks)
}
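# How the multipliers feed into chunk sizing (see _get_adaptive_chunk_size):
# a task whose class name contains "Karcher" reserves 3.0x the raw per-row
# tensor bytes as working memory, so its chunks come out roughly 2.5x smaller
# than a default-multiplier (1.2x) task operating on the same tensors.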
# --- MEMORY CLEANUP BEHAVIOR ---
# Enable aggressive garbage collection and cache clearing
# True = slower but more stable, False = faster but may fragment memory
ENABLE_AGGRESSIVE_CLEANUP = False # Currently off for speed; set True if merges OOM or memory fragments
# How often to force cleanup (every N tasks). 0 = after every task.
# Note: only takes effect when ENABLE_AGGRESSIVE_CLEANUP is True.
CLEANUP_FREQUENCY = 10 # Options: 0 (always), 1, 2, 5, 10
# --- FALLBACK STRATEGY ---
# Fixed chunk sizes to try if adaptive chunking fails
# Powers of 2 work best for GPU memory alignment
FALLBACK_CHUNK_SIZES = [4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
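# These are tried largest-first by _execute_with_fallback (Strategy 3), and only
# after adaptive chunking has already failed.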
# --- FAST PATH OPTIMIZATION ---
# Try to execute entire task at once before chunking
# True = faster when it works, False = always chunk (more conservative)
ENABLE_FAST_PATH = True # Set False if getting frequent OOM on large tasks
# --- TASK ROUTING ---
# Tasks that should always run on CPU (typically I/O bound)
CPU_ONLY_TASKS = [
"LoadTensor",
"GatherTensors",
"SaveTensor",
"TensorWriterTask",
"FinalizeModel",
"PermutedEmbeddings", # Gather operations don't benefit from GPU
]
# ============================================================================
# END OF CONFIGURATION SECTION
# ============================================================================
if sys.platform == "win32":
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{CUDA_MAX_SPLIT_SIZE_MB}"
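# Note: PYTORCH_CUDA_ALLOC_CONF is read when the CUDA caching allocator first
# initializes, so this assignment only has an effect if it runs before the first
# CUDA allocation; as written it is applied on Windows only.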
ValueT = TypeVar("ValueT")
LOG = logging.getLogger(__name__)
def _round_to_power_of_2(n: int, prefer_lower: bool = True) -> int:
"""Round to nearest power of 2 for memory alignment."""
if n <= 0:
return 1
if n == 1:
return 1
# Find the two nearest powers of 2
power = n.bit_length() - 1
lower = 1 << power
upper = 1 << (power + 1)
if prefer_lower or (n - lower) < (upper - n):
return lower
return upper
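# Examples: _round_to_power_of_2(1500) -> 1024, _round_to_power_of_2(2048) -> 2048,
# _round_to_power_of_2(6, prefer_lower=False) -> 8 (equidistant values round up).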
class Task(ABC, BaseModel, Generic[ValueT], frozen=True):
@abstractmethod
def arguments(self) -> Dict[str, "Task"]:
...
@abstractmethod
def execute(self, **kwargs) -> ValueT:
...
def priority(self) -> int:
return 0
def group_label(self) -> Optional[str]:
return None
def uses_accelerator(self) -> bool:
return False
def main_thread_only(self) -> bool:
return False
def duplicate_per_gpu(self) -> bool:
return False
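# A minimal concrete Task might look like this (hypothetical sketch, not part of
# mergekit's actual task set):
#
#   class ScaleTensor(Task[torch.Tensor], frozen=True):
#       source: Task          # task producing the tensor to scale
#       factor: float
#
#       def arguments(self) -> Dict[str, Task]:
#           return {"tensor": self.source}
#
#       def execute(self, tensor: torch.Tensor) -> torch.Tensor:
#           return tensor * self.factor
#
#       def uses_accelerator(self) -> bool:
#           return True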
class TaskUniverse:
tasks: List[Task]
task_to_index: Dict[Task, int]
task_arguments: Dict[int, Dict[str, int]]
_type_id_to_index: Dict[Tuple[type, int], int]
def __init__(self, tasks: Optional[Iterable[Task]] = None):
self.tasks = []
self.task_to_index = {}
self.task_arguments = {}
self._type_id_to_index = {}
if tasks is not None:
for task in tasks:
self.add_task(task)
def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
_ti_key = (type(task), id(task))
if _ti_key in self._type_id_to_index:
index = self._type_id_to_index[_ti_key]
return TaskHandle(self, index)
index = self.task_to_index.setdefault(task, len(self.tasks))
if index < len(self.tasks):
return TaskHandle(self, index)
self.tasks.append(task)
self._type_id_to_index[_ti_key] = index
if recursive:
self.task_arguments[index] = {}
for k, v in task.arguments().items():
self.task_arguments[index][k] = self.add_task(v, recursive=True)._index
return TaskHandle(self, index)
def get_handle(self, task: Task) -> Optional["TaskHandle"]:
if task not in self.task_to_index:
return None
return TaskHandle(self, self.task_to_index[task])
class TaskHandle:
__slots__ = ["_universe", "_index"]
_universe: TaskUniverse
_index: int
def __init__(self, universe: TaskUniverse, index: int):
self._universe = universe
self._index = index
def task(self) -> Task:
return self._universe.tasks[self._index]
def arguments(self) -> Dict[str, "TaskHandle"]:
return {
k: TaskHandle(self._universe, v)
for k, v in self._universe.task_arguments[self._index].items()
}
def __eq__(self, other):
if not isinstance(other, TaskHandle):
return False
return self._index == other._index and self._universe is other._universe
def __hash__(self):
return self._index
def __str__(self):
return f"TaskHandle({type(self.task()).__name__}, {self._index})"
__repr__ = __str__
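# Note: __hash__ uses only the index, so handles from different universes can
# collide in hash; __eq__ still requires the same universe object.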
class ExecutionSchedule:
tasks: List[TaskHandle]
last_use_index: Dict[TaskHandle, int]
def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):
self.tasks = tasks
self.last_use_index = last_use_index
def build_schedule(
targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]
) -> ExecutionSchedule:
if not targets:
return ExecutionSchedule(tasks=[], last_use_index={})
universe = targets[0]._universe
dummy_handle = TaskHandle(universe, -1)
edge_tups: List[Tuple[TaskHandle, TaskHandle]] = []
explored = set()
to_explore = set(targets)
while to_explore:
task = to_explore.pop()
if task in explored:
continue
explored.add(task)
if task in (cached_values or {}):
continue
for dep in task.arguments().values():
to_explore.add(dep)
edge_tups.append((dep, task))
for target in targets:
edge_tups.append((dummy_handle, target))
def _compare_key(node: TaskHandle) -> Tuple[str, int]:
if node._index < 0:
return ("", 0)
task = node.task()
return (task.group_label() or "", -task.priority())
graph = networkx.DiGraph(edge_tups)
schedule: List[TaskHandle] = [
node
for node in networkx.lexicographical_topological_sort(graph, key=_compare_key)
if (node != dummy_handle) and node not in (cached_values or {})
]
last_use_index = {}
for idx, task in reversed(list(enumerate(schedule))):
for dep in task.arguments().values():
if dep not in last_use_index:
last_use_index[dep] = idx
if task not in last_use_index:
last_use_index[task] = idx
for task in cached_values or {}:
if task not in last_use_index:
last_use_index[task] = len(schedule) + 1
return ExecutionSchedule(tasks=schedule, last_use_index=last_use_index)
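# Ordering note: the sort key above is (group_label, -priority), so within the
# topological constraints tasks sharing a group label are scheduled together and
# higher-priority tasks run earlier, which tends to shorten how long intermediate
# values must stay resident.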
class Executor:
math_device: torch.device
storage_device: torch.device
universe: TaskUniverse
targets: List[TaskHandle]
schedule: ExecutionSchedule
cached_values: Optional[Dict[TaskHandle, Any]]
_task_counter: int
def __init__(
self,
targets: Union[List[Task], List[TaskHandle]],
math_device: torch.device = torch.device("cpu"),
storage_device: torch.device = torch.device("cpu"),
cached_values: Optional[Dict[TaskHandle, Any]] = None,
):
self.cached_values = cached_values
self._task_counter = 0
if isinstance(math_device, str):
math_device = torch.device(math_device)
if isinstance(storage_device, str):
storage_device = torch.device(storage_device)
self.math_device = math_device
self.storage_device = storage_device
if targets and isinstance(targets[0], Task):
universe = TaskUniverse(targets)
targets = [universe.add_task(t) for t in targets]
elif targets and isinstance(targets[0], TaskHandle):
universe = targets[0]._universe
elif not targets:
universe = TaskUniverse()
else:
raise ValueError("Targets must be a list of Task or TaskHandle instances")
self.universe = universe
self.targets = targets
self.schedule = build_schedule(targets, cached_values=cached_values)
def _slice_argument(self, arg: Any, start: int, end: int) -> Any:
"""Recursively slice tensors within nested structures."""
if isinstance(arg, torch.Tensor):
if arg.shape[0] > 1:
return arg[start:end]
return arg
elif isinstance(arg, dict):
return {k: self._slice_argument(v, start, end) for k, v in arg.items()}
elif isinstance(arg, list):
return [self._slice_argument(v, start, end) for v in arg]
elif isinstance(arg, tuple):
return tuple(self._slice_argument(v, start, end) for v in arg)
return arg
def _get_memory_stats(self) -> Dict[str, float]:
"""Get current VRAM statistics in GB."""
if self.math_device.type != "cuda":
return {}
allocated = torch.cuda.memory_allocated(self.math_device) / (1024**3)
reserved = torch.cuda.memory_reserved(self.math_device) / (1024**3)
total = torch.cuda.get_device_properties(self.math_device).total_memory / (1024**3)
return {
"allocated_gb": allocated,
"reserved_gb": reserved,
"total_gb": total,
"free_gb": total - allocated,
}
def _get_adaptive_chunk_size(self, task: Task, arguments: Dict[str, Any]) -> int:
"""
Calculate optimal chunk size based on available VRAM and task requirements.
This implements the "measure.py strategy" of targeting a specific VRAM fill level
rather than using currently available memory, which prevents oscillation.
"""
if self.math_device.type == "cpu":
return 1024 # Large default for CPU
# Get hardware capacity
total_vram = torch.cuda.get_device_properties(self.math_device).total_memory
target_bytes = TARGET_VRAM_GB * (1024**3)
# Analyze tensor dimensions and count
num_tensors = 0
width = 0
bytes_per_element = 4 # Default float32
for arg in arguments.values():
if isinstance(arg, torch.Tensor):
num_tensors += 1
width = max(width, arg.shape[-1] if len(arg.shape) > 1 else arg.shape[0])
bytes_per_element = arg.element_size()
elif isinstance(arg, dict):
for v in arg.values():
if isinstance(v, torch.Tensor):
num_tensors += 1
width = max(width, v.shape[-1] if len(v.shape) > 1 else v.shape[0])
bytes_per_element = v.element_size()
if num_tensors == 0 or width == 0:
return 512 # Safe default
# Get task-specific multiplier
task_name = type(task).__name__
multiplier = TASK_MULTIPLIERS.get("default", 1.2)
for key, mult in TASK_MULTIPLIERS.items():
if key in task_name:
multiplier = mult
break
# Calculate bytes per row with multiplier for working memory
bytes_per_row = num_tensors * width * bytes_per_element * multiplier
# Calculate usable VRAM (target minus current allocation and safety margin)
current_allocated = torch.cuda.memory_allocated(self.math_device)
safety_bytes = VRAM_SAFETY_MARGIN_GB * (1024**3)
usable_vram = max(target_bytes - current_allocated - safety_bytes, 1024 * (1024**2))
# Calculate chunk size
chunk_size = max(MIN_CHUNK_SIZE, int(usable_vram // bytes_per_row))
# Apply power-of-2 alignment if enabled (measure.py strategy)
if ENABLE_POWER_OF_2_ALIGNMENT and chunk_size > MIN_CHUNK_SIZE:
chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
LOG.debug(f"Calculated chunk size: {chunk_size} (tensors={num_tensors}, width={width}, mult={multiplier:.2f})")
return chunk_size
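# Worked example (assumed shapes): 3 fp16 tensors of width 4096 with the default
# 1.2x multiplier give bytes_per_row ~= 3 * 4096 * 2 * 1.2 ~= 29.5 KB; with ~6 GB
# of usable headroom that allows ~218k rows, aligned down to 131072 (2**17).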
def _execute_chunked(self, task: Task, arguments: Dict[str, Any]) -> Any:
"""
Execute task in chunks with progressive fallback strategy.
Strategy:
1. Try adaptive chunk size
2. On OOM, reduce by CHUNK_REDUCTION_FACTOR
3. Continue until success or MIN_CHUNK_SIZE reached
"""
# Find total rows to process
total_rows = 0
for arg in arguments.values():
if isinstance(arg, torch.Tensor):
total_rows = arg.shape[0]
break
elif isinstance(arg, dict):
for v in arg.values():
if isinstance(v, torch.Tensor):
total_rows = v.shape[0]
break
if total_rows > 0:
break
if total_rows == 0:
return task.execute(**arguments)
# Calculate initial chunk size
chunk_size = self._get_adaptive_chunk_size(task, arguments)
# FAST PATH: Try to execute all at once if chunk size >= total rows
if ENABLE_FAST_PATH and chunk_size >= total_rows:
try:
gpu_args = {
k: self._move_tensors(v, self.math_device)
for k, v in arguments.items()
}
res = task.execute(**gpu_args)
result = self._move_tensors(res, self.storage_device)
del gpu_args, res
if ENABLE_AGGRESSIVE_CLEANUP:
torch.cuda.empty_cache()
return result
except torch.OutOfMemoryError:
LOG.warning(f"Fast path OOM, falling back to chunking")
torch.cuda.empty_cache()
gc.collect()
chunk_size = max(MIN_CHUNK_SIZE, total_rows // 2)
# Chunked execution with progressive reduction
results = []
i = 0
oom_count = 0
while i < total_rows:
end = min(i + chunk_size, total_rows)
try:
chunk_args_gpu = {
k: self._move_tensors(self._slice_argument(v, i, end), self.math_device)
for k, v in arguments.items()
}
chunk_res = task.execute(**chunk_args_gpu)
results.append(self._move_tensors(chunk_res, self.storage_device))
del chunk_args_gpu, chunk_res
# Aggressive cleanup per measure.py strategy
if ENABLE_AGGRESSIVE_CLEANUP:
torch.cuda.empty_cache()
i = end # Move to next chunk
oom_count = 0 # Reset OOM counter on success
except torch.OutOfMemoryError:
oom_count += 1
torch.cuda.empty_cache()
gc.collect()
# Progressive reduction
old_chunk = chunk_size
chunk_size = max(MIN_CHUNK_SIZE, int(chunk_size * CHUNK_REDUCTION_FACTOR))
# Apply power-of-2 alignment
if ENABLE_POWER_OF_2_ALIGNMENT:
chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
if chunk_size < MIN_CHUNK_SIZE:
LOG.error(f"Chunk size below minimum ({MIN_CHUNK_SIZE}), cannot continue")
raise
LOG.warning(
f"OOM at chunk {old_chunk}, reducing to {chunk_size} "
f"(attempt {oom_count}, progress: {i}/{total_rows})"
)
# Safety: if we OOM too many times, something is wrong
if oom_count > 10:
LOG.error("Too many OOM errors, giving up")
raise
# Concatenate results
if not results:
return None
if isinstance(results[0], torch.Tensor):
return torch.cat(results, dim=0)
elif isinstance(results[0], dict):
out = {}
for k in results[0].keys():
out[k] = torch.cat([r[k] for r in results], dim=0)
return out
return results
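# Example of the OOM back-off with the defaults above: 4096 -> int(4096 * 0.75)
# = 3072 -> aligned down to 2048; a further OOM gives 1536 -> 1024, and so on
# toward MIN_CHUNK_SIZE.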
def _execute_with_fallback(self, task: Task, arguments: Dict[str, Any], accelerator) -> Any:
"""
Execute task with comprehensive fallback strategy.
Strategy:
1. Try full GPU execution
2. Try adaptive chunking
3. Try fixed chunk sizes
4. Fall back to CPU
"""
task_name = type(task).__name__
# Strategy 1: Try full GPU execution for light tasks
try:
gpu_args = {
k: self._move_tensors(v, self.math_device)
for k, v in arguments.items()
}
res = task.execute(**gpu_args)
result = self._move_tensors(res, self.storage_device)
del gpu_args, res
return result
except torch.OutOfMemoryError:
LOG.debug(f"Full GPU execution failed for {task_name}, trying chunked")
torch.cuda.empty_cache()
gc.collect()
except Exception as e:
LOG.warning(f"GPU execution error for {task_name}: {e}")
torch.cuda.empty_cache()
raise
# Strategy 2: Try adaptive chunking
try:
result = self._execute_chunked(task, arguments)
return result
except torch.OutOfMemoryError:
LOG.warning(f"Adaptive chunking failed for {task_name}, trying fixed sizes")
torch.cuda.empty_cache()
gc.collect()
except Exception as e:
LOG.warning(f"Chunking error for {task_name}: {e}")
raise
# Strategy 3: Try fixed chunk sizes
for chunk_size in FALLBACK_CHUNK_SIZES:
if chunk_size < MIN_CHUNK_SIZE:
continue
try:
LOG.info(f"Trying fixed chunk size {chunk_size} for {task_name}")
# Get total rows
total_rows = 0
for arg in arguments.values():
if isinstance(arg, torch.Tensor):
total_rows = arg.shape[0]
break
elif isinstance(arg, dict):
for v in arg.values():
if isinstance(v, torch.Tensor):
total_rows = v.shape[0]
break
if total_rows > 0:
break
if total_rows == 0:
break
results = []
for i in range(0, total_rows, chunk_size):
end = min(i + chunk_size, total_rows)
chunk_args = {
k: self._slice_argument(v, i, end)
for k, v in arguments.items()
}
chunk_args_gpu = {
k: self._move_tensors(v, self.math_device)
for k, v in chunk_args.items()
}
chunk_res = task.execute(**chunk_args_gpu)
results.append(self._move_tensors(chunk_res, self.storage_device))
del chunk_args, chunk_args_gpu, chunk_res
if ENABLE_AGGRESSIVE_CLEANUP:
torch.cuda.empty_cache()
if isinstance(results[0], torch.Tensor):
return torch.cat(results, dim=0)
elif isinstance(results[0], dict):
out = {}
for k in results[0].keys():
out[k] = torch.cat([r[k] for r in results], dim=0)
return out
return results
except torch.OutOfMemoryError:
torch.cuda.empty_cache()
gc.collect()
continue
except Exception as e:
LOG.warning(f"Fixed chunk {chunk_size} failed: {e}")
break
# Strategy 4: signal CPU fallback; the actual CPU execution happens in the caller (_run)
LOG.warning(f"All GPU strategies failed for {task_name}, using CPU")
raise torch.OutOfMemoryError("Forcing CPU fallback")
def _run(
self,
quiet: bool = False,
desc: Optional[str] = None,
) -> Iterator[Tuple[TaskHandle, Any]]:
last_use_index = self.schedule.last_use_index
values: Dict[TaskHandle, Any] = {}
if self.cached_values:
for task, value in self.cached_values.items():
values[task] = value
is_gpu_execution = self.math_device.type != "cpu"
accelerator = get_torch_accelerator_module(self.math_device.type) if is_gpu_execution else None
for idx, task_handle in (
pbar := tqdm.tqdm(
list(enumerate(self.schedule.tasks)),
disable=quiet,
desc=desc or "Executing graph",
)
):
task = task_handle.task()
task_type = type(task).__name__
# Log memory stats periodically
if is_gpu_execution and idx % 10 == 0:
stats = self._get_memory_stats()
LOG.debug(
f"Memory: {stats.get('allocated_gb', 0):.2f}GB allocated, "
f"{stats.get('free_gb', 0):.2f}GB free of {stats.get('total_gb', 0):.2f}GB"
)
# Determine execution strategy
is_cpu_only_task = task_type in CPU_ONLY_TASKS
want_gpu = is_gpu_execution and task.uses_accelerator() and not is_cpu_only_task
# Collect arguments
arguments = {k: values[h] for k, h in task_handle.arguments().items()}
success = False
# Try GPU execution
if want_gpu:
try:
res = self._execute_with_fallback(task, arguments, accelerator)
values[task_handle] = res
success = True
except torch.OutOfMemoryError:
LOG.warning(f"All GPU strategies exhausted for {task_type}, falling back to CPU")
success = False
except Exception as e:
LOG.error(f"GPU execution failed for {task_type}: {e}")
success = False
# Cleanup after GPU attempt
if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
gc.collect()
if accelerator:
accelerator.empty_cache()
# CPU fallback
if not success:
if want_gpu:
LOG.info(f"Executing {task_type} on CPU")
# Ensure cleanup before CPU execution
if is_gpu_execution:
gc.collect()
if accelerator:
accelerator.empty_cache()
# Move arguments to CPU
cpu_arguments = {
k: self._move_tensors(v, torch.device("cpu"))
for k, v in arguments.items()
}
res = task.execute(**cpu_arguments)
del cpu_arguments
res = self._move_tensors(res, self.storage_device)
values[task_handle] = res
del res
del arguments
if task_handle in self.targets:
yield (task_handle, values[task_handle])
# Evict unreferenced values
expired = []
for key in values:
if idx >= last_use_index[key]:
expired.append(key)
for key in expired:
del values[key]
# Periodic cleanup (measure.py strategy)
self._task_counter += 1
if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
if CLEANUP_FREQUENCY == 0 or self._task_counter % max(1, CLEANUP_FREQUENCY) == 0:
gc.collect()
if accelerator:
accelerator.empty_cache()
del values
del pbar
def run(
self,
quiet: bool = False,
desc: Optional[str] = None,
) -> Iterator[Tuple[Task, Any]]:
for handle, value in self._run(quiet=quiet, desc=desc):
yield (handle.task(), value)
def execute(self, desc: Optional[str] = None) -> None:
for _ in self.run(desc=desc):
pass
def _move_tensors(
self, value: Any, device: torch.device, non_blocking: Optional[bool] = None
) -> Any:
"""Move tensors to specified device, handling nested structures."""
if non_blocking is None:
non_blocking = device.type in ["cuda", "xpu"]
if isinstance(value, torch.Tensor):
if value.device == device:
return value
return value.to(device=device, non_blocking=non_blocking)
elif isinstance(value, dict):
return {
k: self._move_tensors(v, device, non_blocking)
for k, v in value.items()
}
elif isinstance(value, list):
return [self._move_tensors(v, device, non_blocking) for v in value]
elif isinstance(value, tuple):
return tuple(self._move_tensors(v, device, non_blocking) for v in value)
return value