AppleSwing committed
Commit 08b56fc
2 Parent(s): 86b14ca c3fc5ce

Merge branch 'main' into pr/15

app.py CHANGED
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-
 import os
 import datetime
 import socket
backend-cli.py CHANGED
@@ -6,6 +6,7 @@ import argparse
 
 import socket
 import random
+import threading
 from datetime import datetime
 
 from src.backend.run_eval_suite import run_evaluation
@@ -16,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
-from src.utils import my_snapshot_download
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -123,7 +124,17 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
-    batch_size = 4
+    batch_size = 1
+    batch_size = eval_request.batch_size
+
+    init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
+    # if init_gpu_info['Mem(M)'] > 500:
+    #     assert False, f"This machine is not empty: {init_gpu_info}"
+    gpu_stats_list = []
+    stop_event = threading.Event()
+    monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
+    monitor_thread.start()
+
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -150,6 +161,20 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         raise
 
     # print("RESULTS", results)
+    stop_event.set()
+    monitor_thread.join()
+    gpu_info = analyze_gpu_stats(gpu_stats_list)
+    for task_name in results['results'].keys():
+        for key, value in gpu_info.items():
+            if "GPU" not in key:
+                results['results'][task_name][f"{key},none"] = int(value)
+            else:
+                results['results'][task_name][f"{key},none"] = value
+
+        results['results'][task_name]['batch_size,none'] = batch_size
+        results['results'][task_name]['precision,none'] = eval_request.precision
+    print(f"gpu_stats_list: {gpu_stats_list}")
+    print("GPU Usage:", gpu_info)
 
     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
     # print(dumped)
@@ -396,9 +421,9 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt", help="Task to debug")
-    parser.add_argument("--model", type=str, default="facebook/opt-1.3b", help="Model to debug")
-    parser.add_argument("--precision", type=str, default="float16", help="Precision to debug")
+    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu", help="Task to debug")
+    parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
+    parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     return parser.parse_args()
@@ -409,23 +434,31 @@ if __name__ == "__main__":
     local_debug = args.debug
     # debug specific task by ping
     if local_debug:
-        debug_model_names = [args.model]  # Use model from arguments
-        debug_task_name = args.task  # Use task from arguments
+        # debug_model_names = [args.model]  # Use model from arguments
+        # debug_task_name = [args.task]  # Use task from arguments
+        debug_model_names = args.model.split(",")
+        debug_task_name = args.task.split(",")
+        precisions = args.precision.split(",")
+        print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
-        for task in task_lst:
+        for precision in precisions:
             for debug_model_name in debug_model_names:
-                task_name = task.benchmark
-                if task_name != debug_task_name:
-                    continue
-                eval_request = EvalRequest(
-                    model=debug_model_name,
-                    private=False,
-                    status="",
-                    json_filepath="",
-                    precision=args.precision,  # Use precision from arguments
-                    inference_framework=args.inference_framework  # Use inference framework from arguments
-                )
-                results = process_evaluation(task, eval_request, limit=args.limit)
+                for task in task_lst:
+                    task_name = task.benchmark
+                    if task_name not in debug_task_name:
+                        continue
+                    try:
+                        eval_request = EvalRequest(
+                            model=debug_model_name,
+                            private=False,
+                            status="",
+                            json_filepath="",
+                            precision=precision,  # Use precision from arguments
+                            inference_framework=args.inference_framework  # Use inference framework from arguments
+                        )
+                        results = process_evaluation(task, eval_request, limit=args.limit)
+                    except Exception as e:
+                        print(f"debug running error: {e}")
     else:
        while True:
            res = False
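
The new process_evaluation body samples GPU statistics on a background thread while run_evaluation executes, then folds the aggregated stats into every task's results. Below is a minimal standalone sketch of that start/sample/stop pattern, using the monitor_gpus and analyze_gpu_stats helpers this commit adds to src/utils.py; the run_with_gpu_monitoring wrapper, the try/finally, and the workload callable are illustrative and not part of the commit (the commit sets the stop event after its try/except around run_evaluation), and the 5-second interval mirrors the value used above.

import threading

from src.utils import analyze_gpu_stats, monitor_gpus

def run_with_gpu_monitoring(workload, interval_s=5):
    # Collect one aggregated nvidia-smi sample every interval_s seconds
    # on a background thread while the workload runs.
    gpu_stats_list = []
    stop_event = threading.Event()
    monitor = threading.Thread(target=monitor_gpus, args=(stop_event, interval_s, gpu_stats_list))
    monitor.start()
    try:
        result = workload()          # e.g. run_evaluation(...)
    finally:
        stop_event.set()             # stop sampling even if the workload raises
        monitor.join()
    # analyze_gpu_stats averages temperature/power/utilisation and keeps peak memory.
    return result, analyze_gpu_stats(gpu_stats_list)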
src/backend/envs.py CHANGED
@@ -63,4 +63,4 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
src/backend/hflm_with_measurement.py CHANGED
@@ -57,12 +57,12 @@ class StopWatch(TextStreamer):
         self.start_decoding = time()
         self.decoding_iterations += 1
         return
-
+    
     def end(self):
         if self.decoding_time is None and self.start_decoding is not None:
            self.decoding_time = time() - self.start_decoding
         return
-
+    
 
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
@@ -287,7 +287,7 @@ class HFLMWithMeasurement(HFLM):
         pbar.close()
 
         return re_ord.get_original(res)
-
+    
     def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
@@ -318,7 +318,7 @@ class HFLMWithMeasurement(HFLM):
             **generation_kwargs,
         )
         end = time()
-
+        
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
 
@@ -403,7 +403,7 @@ class HFLMWithMeasurement(HFLM):
                 f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
             )
         # add EOS token to stop sequences
-        eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
+        eos = self.tok_decode(self.eot_token_id)
         if not until:
             until = [eos]
         else:
src/backend/manage_requests.py CHANGED
@@ -27,24 +27,23 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
+    batch_size: Optional[int] = 1
 
     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"  # ,max_length=4096"
-
+        model_args += ",trust_remote_code=True,device_map=auto"
         if self.precision in ["float16", "float32", "bfloat16"]:
             model_args += f",dtype={self.precision}"
         # Quantized models need some added config, the install of bits and bytes, etc
         # elif self.precision == "8bit":
         #     model_args += ",load_in_8bit=True"
-        # elif self.precision == "4bit":
-        #     model_args += ",load_in_4bit=True"
+        elif self.precision == "4bit":
+            model_args += ",load_in_4bit=True"
         # elif self.precision == "GPTQ":
             # A GPTQ model does not need dtype to be specified,
             # it will be inferred from the config
-            pass
         elif self.precision == "8bit":
             model_args += ",load_in_8bit=True"
-            model_args += ",trust_remote_code=True"
         else:
             raise Exception(f"Unknown precision {self.precision}.")
         return model_args
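
get_model_args() now always appends trust_remote_code=True and device_map=auto, and turns the 4bit precision into a real load_in_4bit flag instead of a comment. A rough sketch of the comma-separated argument string it builds for one request; the model name and field values below are illustrative (they mirror the debug loop in backend-cli.py), and the revision part depends on the dataclass's revision default.

from src.backend.manage_requests import EvalRequest

req = EvalRequest(
    model="facebook/opt-1.3b",       # illustrative model
    private=False,
    status="",
    json_filepath="",
    precision="4bit",
    inference_framework="hf-chat",
)
print(req.get_model_args())
# Roughly: pretrained=facebook/opt-1.3b,revision=<revision>,parallelize=True,
#          trust_remote_code=True,device_map=auto,load_in_4bit=True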
src/backend/run_eval_suite.py CHANGED
@@ -13,7 +13,7 @@ orig_higher_is_better = ConfigurableTask.higher_is_better
 def process_results_decorator(func):
     def wrapper(self, doc, results, *args, **kwargs):
         processed_results = [r[0] for r in results]
-
+        
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
src/display/utils.py CHANGED
@@ -13,6 +13,29 @@ TS = "T/s" #Decoding throughput (tok/s)
 InFrame = "Method" #"Inference framework"
 MULTIPLE_CHOICEs = ["mmlu"]
 
+GPU_TEMP = 'Temp(C)'
+GPU_Power = 'Power(W)'
+GPU_Mem = 'Mem(G)'
+GPU_Name = "GPU"
+GPU_Util = 'Util(%)'
+BATCH_SIZE = 'bs'
+PRECISION = "Precision"
+system_metrics_to_name_map = {
+    "end_to_end_time": f"{E2Es}",
+    "prefilling_time": f"{PREs}",
+    "decoding_throughput": f"{TS}",
+}
+
+gpu_metrics_to_name_map = {
+    GPU_Util: GPU_Util,
+    GPU_TEMP: GPU_TEMP,
+    GPU_Power: GPU_Power,
+    GPU_Mem: GPU_Mem,
+    "batch_size": BATCH_SIZE,
+    "precision": PRECISION,
+    GPU_Name: GPU_Name,
+}
+
 @dataclass
 class Task:
     benchmark: str
@@ -81,11 +104,17 @@ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnConten
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True)])
+
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
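
system_metrics_to_name_map and gpu_metrics_to_name_map translate the raw metric keys written by the backend into the short suffixes that the per-task leaderboard columns above use. A small sketch of the resulting column labels; the task column name is made up (real values come from task.value.col_name):

from src.display.utils import gpu_metrics_to_name_map, system_metrics_to_name_map

task_col_name = "MMLU"  # illustrative task column name
for raw_key, suffix in {**system_metrics_to_name_map, **gpu_metrics_to_name_map}.items():
    print(f"{raw_key!r} -> column '{task_col_name} {suffix}'")
# e.g. 'Mem(G)' -> column 'MMLU Mem(G)', 'batch_size' -> column 'MMLU bs'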
src/leaderboard/read_evals.py CHANGED
@@ -103,6 +103,13 @@ class EvalResult:
 
             if to_add is True:
                 multiplier = 100.0
+                if "GPU" in metric:
+                    results[benchmark][metric] = value
+                    continue
+                if "precision" in metric:
+                    results[benchmark][metric] = value
+                    continue
+
                 if "rouge" in metric and "truthful" not in benchmark:
                     multiplier = 1.0
                 if "squad" in benchmark:
@@ -111,6 +118,10 @@ class EvalResult:
                     multiplier = 1.0
                 if "throughput" in metric:
                     multiplier = 1.0
+                if "batch_" in metric or "Mem" in metric or "Util" in metric:
+                    multiplier = 1
+
+
                 # print('RESULTS', data['results'])
                 # print('XXX', benchmark, metric, value, multiplier)
                 results[benchmark][metric] = value * multiplier
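
The net effect of the two hunks above: string-valued GPU and precision entries are copied through verbatim, the new numeric system metrics (batch size, memory, utilisation) keep a multiplier of 1, and the remaining metrics keep the existing scaling rules. A condensed, illustrative sketch of that branching; the metric keys are examples, with the ",none" suffix being the one process_evaluation writes:

def scaling_for(metric: str) -> str:
    # Collapses the checks above into one function for clarity; not the actual code path.
    if "GPU" in metric or "precision" in metric:
        return "stored verbatim"
    if "throughput" in metric or "batch_" in metric or "Mem" in metric or "Util" in metric:
        return "multiplier 1"
    return "multiplier 100.0 (default accuracy-style scaling)"

for m in ["GPU,none", "precision,none", "batch_size,none", "Mem(G),none", "acc,none"]:
    print(m, "->", scaling_for(m))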
src/populate.py CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_
 
 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
-from src.display.utils import E2Es, PREs, TS
+from src.display.utils import system_metrics_to_name_map, gpu_metrics_to_name_map
 
 def get_leaderboard_df(
     results_path: str,
@@ -45,12 +45,7 @@ def get_leaderboard_df(
         bm = (task.benchmark, task.metric)
         name_to_bm_map[name] = bm
 
-    # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
-    system_metrics_to_name_map = {
-        "end_to_end_time": f"{E2Es}",
-        "prefilling_time": f"{PREs}",
-        "decoding_throughput": f"{TS}",
-    }
+
 
     all_data_json = []
     for entry in all_data_json_:
@@ -63,6 +58,9 @@ def get_leaderboard_df(
                     if sys_metric in entry[k]:
                         new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
 
+                for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
+                    if gpu_metric in entry[k]:
+                        new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
         all_data_json += [new_entry]
 
     # all_data_json.append(baseline_row)
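
With the import switched to the shared maps, the entry loop now also lifts each task's GPU metrics into flat "<task> <suffix>" leaderboard columns. An illustrative run of that inner loop on a single made-up entry (the task name and all values are invented; metric_namne keeps the repository's spelling):

from src.display.utils import gpu_metrics_to_name_map

k = "MMLU"  # illustrative task column name
entry = {k: {"Util(%)": 87.5, "Mem(G)": 40.2, "batch_size": 1, "GPU": "8xA100 80GB"}}  # made-up values

new_entry = {}
for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
    if gpu_metric in entry[k]:
        new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
print(new_entry)
# {'MMLU Util(%)': 87.5, 'MMLU Mem(G)': 40.2, 'MMLU bs': 1, 'MMLU GPU': '8xA100 80GB'}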
src/utils.py CHANGED
@@ -1,6 +1,14 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
+import subprocess
+import re
+import os
 
+try:
+    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
+except:
+    print("local debug: from display.utils")
+    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -32,3 +40,96 @@ def get_dataset_summary_table(file_path):
     df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
 
     return df
+
+def parse_nvidia_smi():
+    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    if visible_devices is not None:
+        gpu_indices = visible_devices.split(',')
+    else:
+        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
+        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to query GPU indices.")
+            return []
+        gpu_indices = result.stdout.strip().split('\n')
+    print(f"gpu_indices: {gpu_indices}")
+    gpu_stats = []
+
+    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
+
+    gpu_name = ""
+    for index in gpu_indices:
+        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
+        output = result.stdout.strip()
+        lines = output.split("\n")
+        for line in lines:
+            match = gpu_info_pattern.search(line)
+            name_match = gpu_name_pattern.search(line)
+            gpu_info = {}
+            if name_match:
+                gpu_name = name_match.group(1).strip()
+            if match:
+                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
+                gpu_info.update({
+                    GPU_TEMP: temp,
+                    GPU_Power: power_usage,
+                    GPU_Mem: round(mem_usage / 1024, 2),
+                    GPU_Util: gpu_util
+                })
+
+            if len(gpu_info) >= 4:
+                gpu_stats.append(gpu_info)
+    print(f"gpu_stats: {gpu_stats}")
+    gpu_name = f"{len(gpu_stats)}x{gpu_name}"
+    gpu_stats_total = {
+        GPU_TEMP: 0,
+        GPU_Power: 0,
+        GPU_Mem: 0,
+        GPU_Util: 0,
+        GPU_Name: gpu_name
+    }
+    for gpu_stat in gpu_stats:
+        gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
+        gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
+        gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
+        gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
+    gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem]  # G
+    gpu_stats_total[GPU_TEMP] /= len(gpu_stats)
+    gpu_stats_total[GPU_Power] /= len(gpu_stats)
+    gpu_stats_total[GPU_Util] /= len(gpu_stats)
+    return [gpu_stats_total]
+
+def monitor_gpus(stop_event, interval, stats_list):
+    while not stop_event.is_set():
+        gpu_stats = parse_nvidia_smi()
+        if gpu_stats:
+            stats_list.extend(gpu_stats)
+        stop_event.wait(interval)
+
+def analyze_gpu_stats(stats_list):
+    # Check if the stats_list is empty, and return None if it is
+    if not stats_list:
+        return None
+
+    # Initialize dictionaries to store the stats
+    avg_stats = {}
+    max_stats = {}
+
+    # Calculate average stats, excluding 'GPU_Mem'
+    for key in stats_list[0].keys():
+        if key != GPU_Mem and key != GPU_Name:
+            total = sum(d[key] for d in stats_list)
+            avg_stats[key] = total / len(stats_list)
+
+    # Calculate max stats for 'GPU_Mem'
+    max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list)
+    if GPU_Name in stats_list[0]:
+        avg_stats[GPU_Name] = stats_list[0][GPU_Name]
+    # Update average stats with max GPU memory usage
+    avg_stats.update(max_stats)
+
+    return avg_stats
+
+if __name__ == "__main__":
+    print(analyze_gpu_stats(parse_nvidia_smi()))
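
parse_nvidia_smi scrapes the human-readable nvidia-smi table rather than the CSV query interface, so it depends on the regexes matching the per-GPU rows. A self-contained check of gpu_info_pattern against a typical table row; the sample line is made up, not captured from a real machine, and the table layout can vary across driver versions:

import re

gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')

sample = "| 30%   45C    P2    68W / 300W |  11000MiB / 24576MiB |     35%      Default |"
match = gpu_info_pattern.search(sample)
print(match.groups() if match else "no match")
# ('45', '68', '11000', '35') -> temperature (C), power draw (W), memory used (MiB), utilisation (%)

If the table layout drifts in a future driver, requesting every field through nvidia-smi --query-gpu with --format=csv would likely be less brittle than regex-scraping the rendered table.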