from modules import shared
from modules.sd_hijack_utils import CondFunc

has_ipex = False
try:
    import torch
    import intel_extension_for_pytorch as ipex  # noqa: F401
    has_ipex = True
except Exception:
    pass


def check_for_xpu():
    return has_ipex and hasattr(torch, 'xpu') and torch.xpu.is_available()


def get_xpu_device_string():
    if shared.cmd_opts.device_id is not None:
        return f"xpu:{shared.cmd_opts.device_id}"
    return "xpu"


def torch_xpu_gc():
    with torch.xpu.device(get_xpu_device_string()):
        torch.xpu.empty_cache()


has_xpu = check_for_xpu()
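
# Illustrative usage (a hedged sketch, not part of this module): a caller that
# wants to run on the Intel GPU could use these helpers roughly like this:
#
#     if has_xpu:
#         device = torch.device(get_xpu_device_string())
#         ...  # move models/tensors to `device` and run inference
#         torch_xpu_gc()  # free cached XPU memory afterwards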


# Arc GPU cannot allocate a single block larger than 4GB: https://github.com/intel/compute-runtime/issues/627
# Here we implement a slicing algorithm to split large batch size into smaller chunks,
# so that SDPA of each chunk wouldn't require any allocation larger than ARC_SINGLE_ALLOCATION_LIMIT.
# The heuristic limit (TOTAL_VRAM // 8) is tuned for Intel Arc A770 16G and Arc A750 8G,
# which is the best trade-off between VRAM usage and performance.
ARC_SINGLE_ALLOCATION_LIMIT = {}
orig_sdp_attn_func = torch.nn.functional.scaled_dot_product_attention


def torch_xpu_scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, *args, **kwargs
):
    # cast to same dtype first
    key = key.to(query.dtype)
    value = value.to(query.dtype)
    if attn_mask is not None and attn_mask.dtype != torch.bool:
        attn_mask = attn_mask.to(query.dtype)

    N = query.shape[:-2]  # Batch size
    L = query.size(-2)  # Target sequence length
    E = query.size(-1)  # Embedding dimension of the query and key
    S = key.size(-2)  # Source sequence length
    Ev = value.size(-1)  # Embedding dimension of the value

    total_batch_size = torch.numel(torch.empty(N))
    device_id = query.device.index
    if device_id not in ARC_SINGLE_ALLOCATION_LIMIT:
        ARC_SINGLE_ALLOCATION_LIMIT[device_id] = min(torch.xpu.get_device_properties(device_id).total_memory // 8, 4 * 1024 * 1024 * 1024)
    batch_size_limit = max(1, ARC_SINGLE_ALLOCATION_LIMIT[device_id] // (L * S * query.element_size()))

    if total_batch_size <= batch_size_limit:
        return orig_sdp_attn_func(
            query,
            key,
            value,
            attn_mask,
            dropout_p,
            is_causal,
            *args, **kwargs
        )

    query = torch.reshape(query, (-1, L, E))
    key = torch.reshape(key, (-1, S, E))
    value = torch.reshape(value, (-1, S, Ev))
    if attn_mask is not None:
        attn_mask = attn_mask.view(-1, L, S)
    chunk_count = (total_batch_size + batch_size_limit - 1) // batch_size_limit
    outputs = []
    for i in range(chunk_count):
        attn_mask_chunk = (
            None
            if attn_mask is None
            else attn_mask[i * batch_size_limit : (i + 1) * batch_size_limit, :, :]
        )
        chunk_output = orig_sdp_attn_func(
            query[i * batch_size_limit : (i + 1) * batch_size_limit, :, :],
            key[i * batch_size_limit : (i + 1) * batch_size_limit, :, :],
            value[i * batch_size_limit : (i + 1) * batch_size_limit, :, :],
            attn_mask_chunk,
            dropout_p,
            is_causal,
            *args, **kwargs
        )
        outputs.append(chunk_output)
    result = torch.cat(outputs, dim=0)
    return torch.reshape(result, (*N, L, Ev))
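
# Worked example (a hedged sketch with assumed numbers, not taken from the code
# above): on an Arc A750 8GB card, total_memory // 8 is roughly 1 GiB, which is
# below the 4 GiB cap, so ARC_SINGLE_ALLOCATION_LIMIT ends up at about 1 GiB.
# For fp16 attention with L = S = 4096 (element_size = 2 bytes) one (L, S)
# attention-score slice needs 4096 * 4096 * 2 bytes = 32 MiB, so
# batch_size_limit is about 1 GiB / 32 MiB = 32, and a call with
# total_batch_size = 80 would be split into ceil(80 / 32) = 3 chunks that are
# concatenated back together at the end.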


def is_xpu_device(device: str | torch.device = None):
    if device is None:
        return False
    if isinstance(device, str):
        return device.startswith("xpu")
    return device.type == "xpu"

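# The hijacks below rely on CondFunc from modules.sd_hijack_utils. As a rough,
# hedged sketch of how such a conditional monkey-patch behaves (the real
# implementation lives in sd_hijack_utils and may differ in detail):
# CondFunc(name, sub_func, cond_func) replaces the function named `name` so
# that each call first evaluates cond_func(orig_func, *args, **kwargs); if it
# is truthy, sub_func(orig_func, *args, **kwargs) is used, otherwise the
# original function runs unchanged. A minimal stand-in could look like:
#
#     def cond_wrap(orig_func, sub_func, cond_func):
#         def wrapper(*args, **kwargs):
#             if cond_func(orig_func, *args, **kwargs):
#                 return sub_func(orig_func, *args, **kwargs)
#             return orig_func(*args, **kwargs)
#         return wrapper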
if has_xpu:
    try:
        # torch.Generator supports "xpu" device since 2.1
        torch.Generator("xpu")
    except RuntimeError:
        # W/A for https://github.com/intel/intel-extension-for-pytorch/issues/452: torch.Generator API doesn't support XPU device (for torch < 2.1)
        CondFunc('torch.Generator',
            lambda orig_func, device=None: torch.xpu.Generator(device),
            lambda orig_func, device=None: is_xpu_device(device))

    # W/A for some OPs that could not handle different input dtypes
    CondFunc('torch.nn.functional.layer_norm',
        lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
        orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs),
        lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
        weight is not None and input.dtype != weight.data.dtype)
    CondFunc('torch.nn.modules.GroupNorm.forward',
        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
    CondFunc('torch.nn.modules.linear.Linear.forward',
        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
    CondFunc('torch.nn.modules.conv.Conv2d.forward',
        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
    CondFunc('torch.bmm',
        lambda orig_func, input, mat2, out=None: orig_func(input.to(mat2.dtype), mat2, out=out),
        lambda orig_func, input, mat2, out=None: input.dtype != mat2.dtype)
    CondFunc('torch.cat',
        lambda orig_func, tensors, dim=0, out=None: orig_func([t.to(tensors[0].dtype) for t in tensors], dim=dim, out=out),
        lambda orig_func, tensors, dim=0, out=None: not all(t.dtype == tensors[0].dtype for t in tensors))
    CondFunc('torch.nn.functional.scaled_dot_product_attention',
        lambda orig_func, *args, **kwargs: torch_xpu_scaled_dot_product_attention(*args, **kwargs),
        lambda orig_func, query, *args, **kwargs: query.is_xpu)