pingnie committed on
Commit
3655a9e
1 Parent(s): 2acbfac

add gpu info

Browse files
Files changed (1) hide show
  1. src/utils.py +36 -27
src/utils.py CHANGED
@@ -2,6 +2,8 @@ import pandas as pd
2
  from huggingface_hub import snapshot_download
3
  import subprocess
4
  import re
 
 
5
  try:
6
  from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
7
  except:
@@ -40,38 +42,45 @@ def get_dataset_summary_table(file_path):
40
  return df
41
 
42
  def parse_nvidia_smi():
43
- # Execute the nvidia-smi command
44
- result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
45
- output = result.stdout.strip()
46
-
47
- # Initialize data storage
 
 
 
 
 
 
48
  gpu_stats = []
49
 
50
- # Regex to extract the relevant data for each GPU
51
  gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
52
  gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
53
- lines = output.split('\n')
54
  gpu_name = ""
55
- for line in lines:
56
- match = gpu_info_pattern.search(line)
57
- name_match = gpu_name_pattern.search(line)
58
-
59
- gpu_info = {}
60
-
61
- if name_match:
62
- # print(name_match)
63
- gpu_name = name_match.group(1).strip()
64
- if match:
65
- temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
66
- gpu_info.update({
67
- GPU_TEMP: temp,
68
- GPU_Power: power_usage,
69
- GPU_Mem: mem_usage,
70
- GPU_Util: gpu_util
71
- })
72
- # print(f"gpu_info: {gpu_info}")
73
- if len(gpu_info) >= 4:
74
- gpu_stats.append(gpu_info)
 
 
75
  gpu_name = f"{len(gpu_stats)}x{gpu_name}"
76
  gpu_stats_total = {
77
  GPU_TEMP: 0,
 
2
  from huggingface_hub import snapshot_download
3
  import subprocess
4
  import re
5
+ import os
6
+
7
  try:
8
  from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
9
  except:
 
42
  return df
43
 
44
  def parse_nvidia_smi():
45
+ visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
46
+ if visible_devices is not None:
47
+ gpu_indices = visible_devices.split(',')
48
+ else:
49
+ # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
50
+ result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
51
+ if result.returncode != 0:
52
+ print("Failed to query GPU indices.")
53
+ return []
54
+ gpu_indices = result.stdout.strip().split('\n')
55
+ print(f"gpu_indices: {gpu_indices}")
56
  gpu_stats = []
57
 
 
58
  gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
59
  gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
60
+
61
  gpu_name = ""
62
+ for index in gpu_indices:
63
+ result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
64
+ output = result.stdout.strip()
65
+ lines = output.split("\n")
66
+ for line in lines:
67
+ match = gpu_info_pattern.search(line)
68
+ name_match = gpu_name_pattern.search(line)
69
+ gpu_info = {}
70
+ if name_match:
71
+ gpu_name = name_match.group(1).strip()
72
+ if match:
73
+ temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
74
+ gpu_info.update({
75
+ GPU_TEMP: temp,
76
+ GPU_Power: power_usage,
77
+ GPU_Mem: mem_usage,
78
+ GPU_Util: gpu_util
79
+ })
80
+
81
+ if len(gpu_info) >= 4:
82
+ gpu_stats.append(gpu_info)
83
+ print(f"len(gpu_stats): {len(gpu_stats)}")
84
  gpu_name = f"{len(gpu_stats)}x{gpu_name}"
85
  gpu_stats_total = {
86
  GPU_TEMP: 0,