|
import collections |
|
import contextlib |
|
import warnings |
|
from typing import Any, Dict, Union, Tuple |
|
|
|
import torch |
|
from . import is_initialized, _get_device_index, _lazy_init |
|
|
|
from ._memory_viz import segments as _segments, memory as _memory |
|
|
|
from torch.types import Device |
|
from torch import _C |
|
|
|
__all__ = ["caching_allocator_alloc", "caching_allocator_delete", "set_per_process_memory_fraction", |
|
"empty_cache", "memory_stats", "memory_stats_as_nested_dict", "reset_accumulated_memory_stats", |
|
"reset_peak_memory_stats", "reset_max_memory_allocated", "reset_max_memory_cached", |
|
"memory_allocated", "max_memory_allocated", "memory_reserved", "max_memory_reserved", |
|
"memory_cached", "max_memory_cached", "memory_snapshot", "memory_summary", "list_gpu_processes", |
|
"mem_get_info"] |
|
|
|
def _host_allocator():

    # Return a handle to the CUDA host (pinned-memory) allocator.

    _lazy_init()

    return torch._C._cuda_cudaHostAllocator()
|
|
|
|
|
@contextlib.contextmanager

def _free_mutex():

    # Hold the caching allocator's free-list mutex for the duration of the block.

    torch._C._cuda_lock_mutex()

    try:

        yield

    finally:

        torch._C._cuda_unlock_mutex()
|
|
|
|
|
def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): |
|
r"""Performs a memory allocation using the CUDA memory allocator. |
|
|
|
    Memory is allocated for a given device and a stream; this
|
function is intended to be used for interoperability with other |
|
frameworks. Allocated memory is released through |
|
:func:`~torch.cuda.caching_allocator_delete`. |
|
|
|
Args: |
|
size (int): number of bytes to be allocated. |
|
device (torch.device or int, optional): selected device. If it is |
|
``None`` the default CUDA device is used. |
|
        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
|
the default stream for the selected device is used. |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
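


    Example (a minimal sketch; assumes a CUDA-capable device and pairs the

    allocation with :func:`~torch.cuda.caching_allocator_delete`)::

        >>> import torch

        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB buffer

        >>> torch.cuda.caching_allocator_delete(ptr)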
|
""" |
|
if device is None: |
|
device = torch.cuda.current_device() |
|
device = _get_device_index(device) |
|
if stream is None: |
|
stream = torch.cuda.current_stream(device) |
|
if isinstance(stream, torch.cuda.streams.Stream): |
|
stream = stream.cuda_stream |
|
if not isinstance(stream, int): |
|
raise TypeError('Invalid type for stream argument, must be ' |
|
'`torch.cuda.Stream` or `int` representing a pointer ' |
|
                        'to an existing stream')
|
with torch.cuda.device(device): |
|
return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) |
|
|
|
|
|
def caching_allocator_delete(mem_ptr): |
|
r"""Deletes memory allocated using the CUDA memory allocator. |
|
|
|
    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
|
is freed here. The associated device and stream are tracked inside |
|
the allocator. |
|
|
|
Args: |
|
mem_ptr (int): memory address to be freed by the allocator. |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) |
|
|
|
|
|
def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) -> None: |
|
r"""Set memory fraction for a process. |
|
    The fraction is used to limit the memory that the caching allocator may

    allocate on a CUDA device. The allowed value equals the total visible

    memory multiplied by the fraction. If a process tries to allocate more

    than the allowed value, an out-of-memory error is raised by the allocator.
|
|
|
Args: |
|
        fraction (float): Range: 0~1. Allowed memory equals total_memory * fraction.
|
device (torch.device or int, optional): selected device. If it is |
|
``None`` the default CUDA device is used. |
|
.. note:: |
|
In general, the total available free memory is less than the total capacity. |
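


    Example (an illustrative sketch; the fraction value and device index

    are arbitrary)::

        >>> import torch

        >>> # hypothetical cap: limit device 0 to half of its total memory

        >>> torch.cuda.set_per_process_memory_fraction(0.5, 0)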
|
""" |
|
_lazy_init() |
|
if device is None: |
|
device = torch.cuda.current_device() |
|
device = _get_device_index(device) |
|
if not isinstance(fraction, float): |
|
raise TypeError('Invalid type for fraction argument, must be `float`') |
|
if fraction < 0 or fraction > 1: |
|
raise ValueError('Invalid fraction value: {}. ' |
|
'Allowed range: 0~1'.format(fraction)) |
|
|
|
torch._C._cuda_setMemoryFraction(fraction, device) |
|
|
|
|
|
def empty_cache() -> None: |
|
r"""Releases all unoccupied cached memory currently held by the caching |
|
    allocator so that the memory can be used by other GPU applications and is

    visible in `nvidia-smi`.
|
|
|
.. note:: |
|
:func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU |
|
memory available for PyTorch. However, it may help reduce fragmentation |
|
of GPU memory in certain cases. See :ref:`cuda-memory-management` for |
|
more details about GPU memory management. |
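


    Example (illustrative; releases cached blocks left behind after a tensor

    goes out of scope)::

        >>> import torch

        >>> x = torch.empty(1024, 1024, device="cuda")

        >>> del x

        >>> torch.cuda.empty_cache()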
|
""" |
|
if is_initialized(): |
|
torch._C._cuda_emptyCache() |
|
|
|
|
|
def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: |
|
r"""Returns a dictionary of CUDA memory allocator statistics for a |
|
given device. |
|
|
|
The return value of this function is a dictionary of statistics, each of |
|
which is a non-negative integer. |
|
|
|
Core statistics: |
|
|
|
- ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
number of allocation requests received by the memory allocator. |
|
- ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
amount of allocated memory. |
|
- ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
number of reserved segments from ``cudaMalloc()``. |
|
- ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
amount of reserved memory. |
|
- ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
number of active memory blocks. |
|
- ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
amount of active memory. |
|
- ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
number of inactive, non-releasable memory blocks. |
|
- ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: |
|
amount of inactive, non-releasable memory. |
|
|
|
For these core statistics, values are broken down as follows. |
|
|
|
Pool type: |
|
|
|
- ``all``: combined statistics across all memory pools. |
|
- ``large_pool``: statistics for the large allocation pool |
|
(as of October 2019, for size >= 1MB allocations). |
|
- ``small_pool``: statistics for the small allocation pool |
|
(as of October 2019, for size < 1MB allocations). |
|
|
|
Metric type: |
|
|
|
- ``current``: current value of this metric. |
|
- ``peak``: maximum value of this metric. |
|
- ``allocated``: historical total increase in this metric. |
|
- ``freed``: historical total decrease in this metric. |
|
|
|
In addition to the core statistics, we also provide some simple event |
|
counters: |
|
|
|
- ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that |
|
result in a cache flush and retry. |
|
- ``"num_ooms"``: number of out-of-memory errors thrown. |
|
|
|
    The caching allocator can be configured via the ``PYTORCH_CUDA_ALLOC_CONF``

    environment variable to not split blocks larger than a defined size (see the

    Memory Management section of the CUDA semantics documentation). This helps

    avoid memory fragmentation but may incur a performance penalty. Additional

    outputs to assist with tuning and evaluating impact:
|
|
|
- ``"max_split_size"``: blocks above this size will not be split. |
|
- ``"oversize_allocations.{current,peak,allocated,freed}"``: |
|
number of over-size allocation requests received by the memory allocator. |
|
- ``"oversize_segments.{current,peak,allocated,freed}"``: |
|
number of over-size reserved segments from ``cudaMalloc()``. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistics for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
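


    Example (a sketch of typical usage; the key name follows the scheme

    described above)::

        >>> import torch

        >>> stats = torch.cuda.memory_stats()

        >>> current_alloc = stats["allocated_bytes.all.current"]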
|
""" |
|
result = [] |
|
|
|
def _recurse_add_to_result(prefix, obj): |
|
if isinstance(obj, dict): |
|
if len(prefix) > 0: |
|
prefix += "." |
|
for k, v in obj.items(): |
|
_recurse_add_to_result(prefix + k, v) |
|
else: |
|
result.append((prefix, obj)) |
|
|
|
stats = memory_stats_as_nested_dict(device=device) |
|
_recurse_add_to_result("", stats) |
|
result.sort() |
|
|
|
return collections.OrderedDict(result) |
|
|
|
|
|
def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: |
|
r"""Returns the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" |
|
if not is_initialized(): |
|
return {} |
|
device = _get_device_index(device, optional=True) |
|
return torch._C._cuda_memoryStats(device) |
|
|
|
|
|
def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: |
|
r"""Resets the "accumulated" (historical) stats tracked by the CUDA memory allocator. |
|
|
|
See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to |
|
the `"allocated"` and `"freed"` keys in each individual stat dict, as well as |
|
`"num_alloc_retries"` and `"num_ooms"`. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
device = _get_device_index(device, optional=True) |
|
return torch._C._cuda_resetAccumulatedMemoryStats(device) |
|
|
|
|
|
def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: |
|
r"""Resets the "peak" stats tracked by the CUDA memory allocator. |
|
|
|
See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the |
|
`"peak"` key in each individual stat dict. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
device = _get_device_index(device, optional=True) |
|
return torch._C._cuda_resetPeakMemoryStats(device) |
|
|
|
|
|
def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: |
|
r"""Resets the starting point in tracking maximum GPU memory occupied by |
|
tensors for a given device. |
|
|
|
See :func:`~torch.cuda.max_memory_allocated` for details. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. warning:: |
|
This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets |
|
/all/ peak memory stats. |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
warnings.warn( |
|
"torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " |
|
"which resets /all/ peak memory stats.", |
|
FutureWarning) |
|
return reset_peak_memory_stats(device=device) |
|
|
|
|
|
def reset_max_memory_cached(device: Union[Device, int] = None) -> None: |
|
r"""Resets the starting point in tracking maximum GPU memory managed by the |
|
caching allocator for a given device. |
|
|
|
See :func:`~torch.cuda.max_memory_cached` for details. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. warning:: |
|
This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets |
|
/all/ peak memory stats. |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
warnings.warn( |
|
"torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " |
|
"which resets /all/ peak memory stats.", |
|
FutureWarning) |
|
return reset_peak_memory_stats(device=device) |
|
|
|
|
|
def memory_allocated(device: Union[Device, int] = None) -> int: |
|
r"""Returns the current GPU memory occupied by tensors in bytes for a given |
|
device. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
This is likely less than the amount shown in `nvidia-smi` since some |
|
unused memory can be held by the caching allocator and some context |
|
needs to be created on GPU. See :ref:`cuda-memory-management` for more |
|
details about GPU memory management. |
|
""" |
|
return memory_stats(device=device).get("allocated_bytes.all.current", 0) |
|
|
|
|
|
def max_memory_allocated(device: Union[Device, int] = None) -> int: |
|
r"""Returns the maximum GPU memory occupied by tensors in bytes for a given |
|
device. |
|
|
|
By default, this returns the peak allocated memory since the beginning of |
|
this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to |
|
reset the starting point in tracking this metric. For example, these two |
|
functions can measure the peak allocated memory usage of each iteration in a |
|
training loop. |
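


    Example (an illustrative sketch; ``model`` and ``batches`` are

    hypothetical placeholders for a real module and data source)::

        >>> for batch in batches:

        ...     torch.cuda.reset_peak_memory_stats()

        ...     model(batch).sum().backward()

        ...     peak = torch.cuda.max_memory_allocated()  # peak for this iteration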
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
return memory_stats(device=device).get("allocated_bytes.all.peak", 0) |
|
|
|
|
|
def memory_reserved(device: Union[Device, int] = None) -> int: |
|
r"""Returns the current GPU memory managed by the caching allocator in bytes |
|
for a given device. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
return memory_stats(device=device).get("reserved_bytes.all.current", 0) |
|
|
|
|
|
def max_memory_reserved(device: Union[Device, int] = None) -> int: |
|
r"""Returns the maximum GPU memory managed by the caching allocator in bytes |
|
for a given device. |
|
|
|
By default, this returns the peak cached memory since the beginning of this |
|
program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset |
|
the starting point in tracking this metric. For example, these two functions |
|
can measure the peak cached memory amount of each iteration in a training |
|
loop. |
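


    Example (illustrative)::

        >>> torch.cuda.reset_peak_memory_stats()

        >>> peak = torch.cuda.max_memory_reserved()  # peak since the reset above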
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
|
""" |
|
return memory_stats(device=device).get("reserved_bytes.all.peak", 0) |
|
|
|
|
|
def memory_cached(device: Union[Device, int] = None) -> int: |
|
r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" |
|
warnings.warn( |
|
"torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved", |
|
FutureWarning) |
|
return memory_reserved(device=device) |
|
|
|
|
|
def max_memory_cached(device: Union[Device, int] = None) -> int: |
|
r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" |
|
warnings.warn( |
|
"torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved", |
|
FutureWarning) |
|
return max_memory_reserved(device=device) |
|
|
|
|
|
def memory_snapshot(): |
|
r"""Returns a snapshot of the CUDA memory allocator state across all devices. |
|
|
|
Interpreting the output of this function requires familiarity with the |
|
memory allocator internals. |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
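


    Example (a minimal sketch; the layout of each entry is an allocator

    implementation detail)::

        >>> snapshot = torch.cuda.memory_snapshot()

        >>> num_entries = len(snapshot)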
|
""" |
|
return torch._C._cuda_memorySnapshot() |
|
|
|
|
|
def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: |
|
r"""Returns a human-readable printout of the current memory allocator |
|
statistics for a given device. |
|
|
|
This can be useful to display periodically during training, or when |
|
handling out-of-memory exceptions. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
printout for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
abbreviated (bool, optional): whether to return an abbreviated summary |
|
(default: False). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more details about GPU memory |
|
management. |
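


    Example (illustrative)::

        >>> print(torch.cuda.memory_summary(abbreviated=True))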
|
""" |
|
device = _get_device_index(device, optional=True) |
|
stats = memory_stats(device=device) |
|
|
|
    def _format_size(sz, pref_sz):

        # `pref_sz` selects the unit prefix so related rows share the same unit.
|
prefixes = ["B ", "KB", "MB", "GB", "TB", "PB"] |
|
prefix = prefixes[0] |
|
for new_prefix in prefixes[1:]: |
|
if pref_sz < 768 * 1024: |
|
break |
|
prefix = new_prefix |
|
sz //= 1024 |
|
pref_sz /= 1024 |
|
return "{:7d} {}".format(sz, prefix) |
|
|
|
def _format_count(cnt, pref_cnt): |
|
prefixes = [" ", "K", "M"] |
|
prefix = prefixes[0] |
|
for new_prefix in prefixes[1:]: |
|
if pref_cnt < 750 * 1000: |
|
break |
|
prefix = new_prefix |
|
cnt //= 1000 |
|
pref_cnt /= 1000 |
|
return "{:7d} {} ".format(cnt, prefix) |
|
|
|
metrics_to_display = [ |
|
("allocated_bytes", "Allocated memory", _format_size), |
|
("active_bytes", "Active memory", _format_size), |
|
("reserved_bytes", "GPU reserved memory", _format_size), |
|
("inactive_split_bytes", "Non-releasable memory", _format_size), |
|
("allocation", "Allocations", _format_count), |
|
("active", "Active allocs", _format_count), |
|
("segment", "GPU reserved segments", _format_count), |
|
("inactive_split", "Non-releasable allocs", _format_count), |
|
] |
|
|
|
lines = [] |
|
lines.append("=" * 75) |
|
lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") |
|
lines.append("-" * 75) |
|
lines.append(" {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} ") |
|
lines.append("=" * 75) |
|
lines.append(" Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed ") |
|
|
|
for metric_key, metric_name, formatter in metrics_to_display: |
|
lines.append("-" * 75) |
|
submetrics = [("all", metric_name)] |
|
if not abbreviated: |
|
submetrics.append(("large_pool", " from large pool")) |
|
submetrics.append(("small_pool", " from small pool")) |
|
|
|
current_prefval, peak_prefval, allocated_prefval, freed_prefval = None, None, None, None |
|
|
|
for submetric_key, submetric_name in submetrics: |
|
prefix = metric_key + "." + submetric_key + "." |
|
|
|
current = stats[prefix + "current"] |
|
peak = stats[prefix + "peak"] |
|
allocated = stats[prefix + "allocated"] |
|
freed = stats[prefix + "freed"] |
|
|
|
if current_prefval is None: |
|
current_prefval = current |
|
peak_prefval = peak |
|
allocated_prefval = allocated |
|
freed_prefval = freed |
|
|
|
lines.append(" {:<21} | {} | {} | {} | {} ".format( |
|
submetric_name, |
|
formatter(current, current_prefval), |
|
formatter(peak, peak_prefval), |
|
formatter(allocated, allocated_prefval), |
|
formatter(freed, freed_prefval)), |
|
) |
|
|
|
metrics_to_display = [ |
|
("oversize_allocations", "Oversize allocations", _format_count), |
|
("oversize_segments", "Oversize GPU segments", _format_count), |
|
] |
|
|
|
for metric_key, metric_name, formatter in metrics_to_display: |
|
lines.append("-" * 75) |
|
|
|
prefix = metric_key + "." |
|
|
|
current = stats[prefix + "current"] |
|
peak = stats[prefix + "peak"] |
|
allocated = stats[prefix + "allocated"] |
|
freed = stats[prefix + "freed"] |
|
|
|
lines.append(" {:<21} | {} | {} | {} | {} ".format( |
|
metric_name, |
|
formatter(current, current), |
|
formatter(peak, peak), |
|
formatter(allocated, allocated), |
|
formatter(freed, freed)), |
|
) |
|
|
|
lines.append("=" * 75) |
|
|
|
fmt_dict = {"_": "", "device": device} |
|
for k, v in stats.items(): |
|
fmt_dict[k.replace(".", "-")] = v |
|
return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" |
|
|
|
|
|
def list_gpu_processes(device: Union[Device, int] = None) -> str: |
|
r"""Returns a human-readable printout of the running processes |
|
and their GPU memory use for a given device. |
|
|
|
This can be useful to display periodically during training, or when |
|
handling out-of-memory exceptions. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
printout for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
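


    Example (illustrative; requires the ``pynvml`` package)::

        >>> print(torch.cuda.list_gpu_processes())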
|
""" |
|
|
|
try: |
|
import pynvml |
|
except ModuleNotFoundError: |
|
return("pynvml module not found, please install pynvml") |
|
from pynvml import NVMLError_DriverNotLoaded |
|
try: |
|
pynvml.nvmlInit() |
|
except NVMLError_DriverNotLoaded: |
|
return ("cuda driver can't be loaded, is cuda enabled?") |
|
device = _get_device_index(device, optional=True) |
|
handle = pynvml.nvmlDeviceGetHandleByIndex(device) |
|
procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) |
|
lines = [] |
|
lines.append(f"GPU:{device}") |
|
if len(procs) == 0: |
|
lines.append("no processes are running") |
|
for p in procs: |
|
mem = p.usedGpuMemory / (1024 * 1024) |
|
lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory") |
|
return "\n".join(lines) |
|
|
|
def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]: |
|
r"""Returns the global free and total GPU memory occupied for a given |
|
device using cudaMemGetInfo. |
|
|
|
Args: |
|
device (torch.device or int, optional): selected device. Returns |
|
statistic for the current device, given by :func:`~torch.cuda.current_device`, |
|
if :attr:`device` is ``None`` (default). |
|
|
|
.. note:: |
|
See :ref:`cuda-memory-management` for more |
|
details about GPU memory management. |
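


    Example (illustrative)::

        >>> free, total = torch.cuda.mem_get_info()  # bytes free / total on the current device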
|
""" |
|
if device is None: |
|
device = torch.cuda.current_device() |
|
device = _get_device_index(device) |
|
return torch.cuda.cudart().cudaMemGetInfo(device) |
|
|
|
def _record_memory_history(enabled: bool, device: Union[Device, int] = None):

    # Toggle recording of allocation history (e.g. stack traces) in memory snapshots.

    with torch.cuda.device(device):

        _C._cuda_recordMemoryHistory(enabled)
|
|
|
def _snapshot(device: Union[Device, int] = None):

    # Take a memory snapshot with the given device active.

    with torch.cuda.device(device):

        return _C._cuda_memorySnapshot()
|
|
|
def _save_segment_usage(filename='output.svg', snapshot=None):

    # Render a visualization of allocator segment usage to ``filename``.

    if snapshot is None:

        snapshot = memory_snapshot()

    with open(filename, 'w') as f:

        f.write(_segments(snapshot))
|
|
|
def _save_memory_usage(filename='output.svg', snapshot=None):

    # Render a visualization of memory usage within each segment to ``filename``.

    if snapshot is None:

        snapshot = memory_snapshot()

    with open(filename, 'w') as f:

        f.write(_memory(snapshot))
|
|
|
def _set_allocator_settings(env: str):

    # Apply allocator settings from a ``PYTORCH_CUDA_ALLOC_CONF``-style string.

    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
|
|