Commit c2dbb45 · Parent(s): ae99472

Fix bugs in quantization
src/backend/hflm_with_measurement.py    CHANGED

@@ -315,6 +315,15 @@ class HFLMWithMeasurement(HFLM):
             generation_kwargs.pop("is_gsm8k")
 
         context_length = context.shape[1]
+        model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
 
         if not is_gsm8k:
             # build stopping criteria
@@ -356,8 +365,6 @@ class HFLMWithMeasurement(HFLM):
 
         model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
         model_size_param = get_model_size(model_info=model_info, precision=self.precision)
-
-        model_config = self.model.config
 
         n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
         d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
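Note on the added precision check: when a model is loaded with bitsandbytes quantization, transformers attaches a quantization_config to the model config, and the private _load_in_4bit / _load_in_8bit attributes read by the patch back the public load_in_4bit / load_in_8bit properties in recent releases. Below is a minimal sketch of the same branching against a standalone BitsAndBytesConfig; the infer_precision helper is illustrative only, not part of the commit.

from transformers import BitsAndBytesConfig

def infer_precision(quantization_config):
    # Same branching as the patched code, but via the public properties.
    if quantization_config.load_in_4bit:
        return "4bit"
    elif quantization_config.load_in_8bit:
        return "8bit"
    else:
        raise ValueError("Unknown precision")

print(infer_precision(BitsAndBytesConfig(load_in_4bit=True)))  # "4bit"
print(infer_precision(BitsAndBytesConfig(load_in_8bit=True)))  # "8bit"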
    	
src/backend/tasks/measurement_task_utils.py    CHANGED

@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 
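The decorator edits above assume each per-request result tuple now carries two extra measurements: MFU (presumably model FLOPs utilization) at index 4 and MBU (presumably model bandwidth utilization) at index 5. They are averaged across results, merged into the task's result dict, aggregated with mean, and marked higher-is-better. A tiny sketch of that tuple convention, with invented sample numbers:

# Invented sample data; only the tuple layout matters:
# (raw_result, end_to_end_time, prefilling_time, decoding_throughput, mfu, mbu)
results = [
    ("output A", 1.20, 0.15, 42.0, 0.30, 0.55),
    ("output B", 1.40, 0.18, 39.5, 0.26, 0.61),
]

mfu = sum([r[4] for r in results]) / len(results)  # averaged exactly as in the patch
mbu = sum([r[5] for r in results]) / len(results)

result_dict = {"mfu": mfu, "mbu": mbu}  # later aggregated with mean; higher is better
print(result_dict)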
    	
src/utils.py    CHANGED

@@ -98,7 +98,8 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -110,7 +111,7 @@ def parse_nvidia_smi():
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
-                gpu_name = name_match.
+                gpu_name = ''.join(filter(None, name_match.groups())).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
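For reference, the replacement name pattern keeps only the first uppercase/numeric token after "NVIDIA", optionally prefixed by "RTX", and the ''.join(filter(None, ...)) call drops the optional group when it did not match. A quick check against two invented nvidia-smi lines (real output varies by driver and card):

import re

gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')

for line in [
    "|   0  NVIDIA RTX A6000          Off |",     # invented sample line
    "|   0  NVIDIA A100-SXM4-80GB     Off |",     # invented sample line
]:
    name_match = gpu_name_pattern.search(line)
    if name_match:
        # filter(None, ...) removes the unmatched optional group before joining
        print(''.join(filter(None, name_match.groups())).strip())
# Prints "RTX A6000" and "A100".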