AppleSwing committed
Commit
c2dbb45
1 Parent(s): ae99472

Fix bugs in quantization

src/backend/hflm_with_measurement.py CHANGED
@@ -315,6 +315,15 @@ class HFLMWithMeasurement(HFLM):
         generation_kwargs.pop("is_gsm8k")
 
         context_length = context.shape[1]
+        model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
 
         if not is_gsm8k:
             # build stopping criteria
@@ -356,8 +365,6 @@ class HFLMWithMeasurement(HFLM):
 
         model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
         model_size_param = get_model_size(model_info=model_info, precision=self.precision)
-
-        model_config = self.model.config
 
         n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
         d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
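The precision fallback added in the first hunk reads the private _load_in_4bit / _load_in_8bit flags off the model's quantization_config. A minimal standalone sketch of the same idea, using a hypothetical resolve_precision helper and a defensive getattr for configs without a quantization section (an assumption, not code from the repo):

    def resolve_precision(model_config, precision=None):
        # Keep an explicitly supplied precision untouched.
        if precision:
            return precision
        # Otherwise infer it from the bitsandbytes-style quantization config,
        # as the commit does with model_config.quantization_config.
        quant_cfg = getattr(model_config, "quantization_config", None)
        if quant_cfg is not None and getattr(quant_cfg, "_load_in_4bit", False):
            return "4bit"
        if quant_cfg is not None and getattr(quant_cfg, "_load_in_8bit", False):
            return "8bit"
        raise ValueError("Unknown precision")

As committed, the branch assumes quantization_config exists on the config, so an unquantized model with self.precision unset would likely hit an AttributeError before reaching the ValueError.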
src/backend/tasks/measurement_task_utils.py CHANGED
@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
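The two new columns averaged above are presumably MFU (model FLOPs utilization) and MBU (memory bandwidth utilization); the diff does not show the tuple layout, but the indexing implies something like the sketch below (metric names and sample numbers are illustrative assumptions, not values from the repo):

    # Hypothetical per-request measurement tuple implied by the r[1]..r[5] indexing:
    # (task_output, end_to_end_time, prefilling_time, decoding_throughput, mfu, mbu)
    results = [
        (None, 12.3, 0.41, 35.2, 0.42, 0.58),
        (None, 11.8, 0.39, 36.0, 0.44, 0.61),
    ]

    mfu = sum(r[4] for r in results) / len(results)  # ~0.43, averaged like the other metrics
    mbu = sum(r[5] for r in results) / len(results)  # ~0.595

Since higher_is_better marks both metrics True, larger utilization values rank better, consistent with decoding_throughput.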
 
src/utils.py CHANGED
@@ -98,7 +98,8 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -110,7 +111,7 @@ def parse_nvidia_smi():
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
-                gpu_name = name_match.group(1).strip()
+                gpu_name = ''.join(filter(None, name_match.groups())).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
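The relaxed name pattern makes the RTX prefix optional, which is why the assignment now joins filter(None, name_match.groups()) instead of taking a single group: the optional group comes back as None when it did not match. A quick check against two plausible nvidia-smi name lines (the sample strings are assumptions for illustration):

    import re

    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')

    for line in ("|   0  NVIDIA A100-SXM4-80GB      On   |",
                 "|   0  NVIDIA RTX A6000           On   |"):
        m = gpu_name_pattern.search(line)
        if m:
            # filter(None, ...) drops the optional "RTX " group when it is absent
            print(''.join(filter(None, m.groups())).strip())
    # prints "A100" then "RTX A6000"

Note the pattern captures only one run of uppercase letters and digits after the optional RTX, so consumer names such as "GeForce RTX 4090" would come out truncated; it appears tuned for data-center and workstation cards.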