# Source: test / modules/api/nvml.py
# Uploaded by bilegentile with huggingface_hub ("Upload folder using huggingface_hub", commit c19ca42, verified)
# Becomes True once get_nvml() has gone through its one-time NVML setup.
nvml_initialized = False

# pynvml is optional: record whether it could be imported so get_nvml() can
# return an empty result instead of raising when the package is missing.
try:
    import pynvml as nv
except ImportError:
    nvml_ok = False
else:
    nvml_ok = True
def get_reason(val):
    """Describe a clocks-throttle-reasons bitmask as readable text.

    Each known bit that is set in ``val`` contributes one label; labels are
    joined with ``', '`` in ascending bit order. Bits without a label are
    ignored.

    Args:
        val: Integer bitmask (get_nvml() passes the value returned by
            ``nvmlDeviceGetCurrentClocksThrottleReasons``).

    Returns:
        The comma-separated labels, or ``'ok'`` when no known bit is set.
    """
    known_bits = (
        (1, 'gpu idle'),
        (2, 'applications clocks setting'),
        (4, 'sw power cap'),
        (8, 'hw slowdown'),
        (16, 'sync boost'),
        (32, 'sw thermal slowdown'),
        (64, 'hw thermal slowdown'),
        (128, 'hw power brake slowdown'),
        (256, 'display clock setting'),
    )
    active = []
    for bit, label in known_bits:
        if bit & val:
            active.append(label)
    if not active:
        return 'ok'
    return ', '.join(active)
def get_nvml():
    """Collect a status snapshot for every GPU visible through NVML.

    NVML is initialised lazily on the first call. Each device is described by
    a dict with the keys ``name``, ``version``, ``pci``, ``memory``, ``clock``,
    ``load``, ``power`` and ``state``.

    Returns:
        A list with one dict per device. An empty list is returned when
        pynvml could not be imported or when any NVML call raises.

    Side effects:
        Sets the module flag ``nvml_initialized`` after a successful
        ``nvmlInit()``, and sets ``nvml_ok`` to False on any failure.
    """
    global nvml_initialized # pylint: disable=global-statement
    global nvml_ok # pylint: disable=global-statement
    if not nvml_ok:
        return []
    try:
        if not nvml_initialized:
            nv.nvmlInit()
            # Flip the flag only after init succeeded, so it never claims a
            # library that failed to start.
            nvml_initialized = True
        # Driver-wide values do not depend on the device: query them once
        # instead of once per GPU.
        cuda_version = nv.nvmlSystemGetCudaDriverVersion()
        driver_version = nv.nvmlSystemGetDriverVersion()
        devices = []
        for i in range(nv.nvmlDeviceGetCount()):
            dev = nv.nvmlDeviceGetHandleByIndex(i)
            # Fetch each multi-field struct once so all values derived from it
            # come from the same reading (and the driver is queried less).
            pci = nv.nvmlDeviceGetPciInfo(dev)
            mem = nv.nvmlDeviceGetMemoryInfo(dev)
            load = nv.nvmlDeviceGetUtilizationRates(dev)
            device = {
                'name': nv.nvmlDeviceGetName(dev),
                'version': {
                    'cuda': cuda_version,
                    'driver': driver_version,
                    'vbios': nv.nvmlDeviceGetVbiosVersion(dev),
                    'rom': nv.nvmlDeviceGetInforomImageVersion(dev),
                    'capabilities': nv.nvmlDeviceGetCudaComputeCapability(dev),
                },
                'pci': {
                    'link': nv.nvmlDeviceGetCurrPcieLinkGeneration(dev),
                    'width': nv.nvmlDeviceGetCurrPcieLinkWidth(dev),
                    'busid': pci.busId,
                    'deviceid': pci.pciDeviceId,
                },
                'memory': { # raw values scaled by 1/1024/1024 (bytes -> MiB per the NVML docs)
                    'total': round(mem.total / 1024 / 1024, 2),
                    'free': round(mem.free / 1024 / 1024, 2),
                    'used': round(mem.used / 1024 / 1024, 2),
                },
                'clock': { # [current, max] per clock type: 0=gpu, 1=sm, 2=memory
                    'gpu': [nv.nvmlDeviceGetClockInfo(dev, 0), nv.nvmlDeviceGetMaxClockInfo(dev, 0)],
                    'sm': [nv.nvmlDeviceGetClockInfo(dev, 1), nv.nvmlDeviceGetMaxClockInfo(dev, 1)],
                    'memory': [nv.nvmlDeviceGetClockInfo(dev, 2), nv.nvmlDeviceGetMaxClockInfo(dev, 2)],
                },
                'load': {
                    'gpu': round(load.gpu),
                    'memory': round(load.memory),
                    'temp': nv.nvmlDeviceGetTemperature(dev, 0),
                    'fan': nv.nvmlDeviceGetFanSpeed(dev),
                },
                # [current draw, enforced limit], raw values scaled by 1/1000
                'power': [round(nv.nvmlDeviceGetPowerUsage(dev) / 1000, 2), round(nv.nvmlDeviceGetEnforcedPowerLimit(dev) / 1000, 2)],
                'state': get_reason(nv.nvmlDeviceGetCurrentClocksThrottleReasons(dev)),
            }
            devices.append(device)
        # log.debug(f'nmvl: {devices}')
        return devices
    except Exception:
        # Best-effort monitoring: any failure disables NVML reporting for the
        # rest of the process (later calls return [] without retrying).
        # NOTE(review): a single unsupported query on one device therefore
        # turns off reporting for all devices - confirm this is intended.
        # log.debug(f'nvml failed: {e}')
        nvml_ok = False
        return []