Commit c2dbb45 · Parent(s): ae99472

Fix bugs in quantization
src/backend/hflm_with_measurement.py    CHANGED

@@ -315,6 +315,15 @@ class HFLMWithMeasurement(HFLM):
             generation_kwargs.pop("is_gsm8k")
 
         context_length = context.shape[1]
+        model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
 
         if not is_gsm8k:
             # build stopping criteria
@@ -356,8 +365,6 @@ class HFLMWithMeasurement(HFLM):
 
         model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
         model_size_param = get_model_size(model_info=model_info, precision=self.precision)
-
-        model_config = self.model.config
 
         n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
         d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
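Note on the added precision check: when a model is loaded with bitsandbytes quantization, transformers attaches a quantization_config to the model config, and the private _load_in_4bit / _load_in_8bit attributes read by the patch back the public load_in_4bit / load_in_8bit properties in recent releases. Below is a minimal sketch of the same branching against a standalone BitsAndBytesConfig; the infer_precision helper is illustrative only, not part of the commit.

from transformers import BitsAndBytesConfig

def infer_precision(quantization_config):
    # Same branching as the patched code, but via the public properties.
    if quantization_config.load_in_4bit:
        return "4bit"
    elif quantization_config.load_in_8bit:
        return "8bit"
    else:
        raise ValueError("Unknown precision")

print(infer_precision(BitsAndBytesConfig(load_in_4bit=True)))  # "4bit"
print(infer_precision(BitsAndBytesConfig(load_in_8bit=True)))  # "8bit"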
    	
src/backend/tasks/measurement_task_utils.py    CHANGED

@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 
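The decorator edits above assume each per-request result tuple now carries two extra measurements: MFU (presumably model FLOPs utilization) at index 4 and MBU (presumably model bandwidth utilization) at index 5. They are averaged across results, merged into the task's result dict, aggregated with mean, and marked higher-is-better. A tiny sketch of that tuple convention, with invented sample numbers:

# Invented sample data; only the tuple layout matters:
# (raw_result, end_to_end_time, prefilling_time, decoding_throughput, mfu, mbu)
results = [
    ("output A", 1.20, 0.15, 42.0, 0.30, 0.55),
    ("output B", 1.40, 0.18, 39.5, 0.26, 0.61),
]

mfu = sum([r[4] for r in results]) / len(results)  # averaged exactly as in the patch
mbu = sum([r[5] for r in results]) / len(results)

result_dict = {"mfu": mfu, "mbu": mbu}  # later aggregated with mean; higher is better
print(result_dict)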
    	
src/utils.py    CHANGED

@@ -98,7 +98,8 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -110,7 +111,7 @@ def parse_nvidia_smi():
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
-                gpu_name = name_match.
+                gpu_name = ''.join(filter(None, name_match.groups())).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
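For reference, the replacement name pattern keeps only the first uppercase/numeric token after "NVIDIA", optionally prefixed by "RTX", and the ''.join(filter(None, ...)) call drops the optional group when it did not match. A quick check against two invented nvidia-smi lines (real output varies by driver and card):

import re

gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')

for line in [
    "|   0  NVIDIA RTX A6000          Off |",     # invented sample line
    "|   0  NVIDIA A100-SXM4-80GB     Off |",     # invented sample line
]:
    name_match = gpu_name_pattern.search(line)
    if name_match:
        # filter(None, ...) removes the unmatched optional group before joining
        print(''.join(filter(None, name_match.groups())).strip())
# Prints "RTX A6000" and "A100".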