mobicham committed
Commit 5978e10 · verified · 1 Parent(s): 9349b3e

Update README.md

Files changed (1)
  1. README.md +9 -11
README.md CHANGED
@@ -56,24 +56,22 @@ from hqq.utils.patching import *
  from hqq.core.quantize import *
  from hqq.utils.generation_hf import HFGenerator

+ #Settings
+ ###################################################
+ backend = "torchao_int4" #"torchao_int4" (4-bit only) or "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
+ compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+ device = 'cuda:0'
+ cache_dir = '.'
+
  #Load the model
  ###################################################
  model_id = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
-
- compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
- cache_dir = '.'
- model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
+ model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device)
  tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

- quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
- patch_linearlayers(model, patch_add_quant_config, quant_config)
-
  #Use optimized inference kernels
  ###################################################
- HQQLinear.set_backend(HQQBackend.PYTORCH)
- #prepare_for_inference(model) #default backend
- prepare_for_inference(model, backend="torchao_int4")
- #prepare_for_inference(model, backend="bitblas") #takes a while to init...
+ prepare_for_inference(model, backend=backend) #takes a while to init...

  #Generate
  ###################################################
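For context, a minimal end-to-end sketch of how the README snippet reads after this commit. The diff cuts off at the `#Generate` header, so the imports and the `HFGenerator` warmup/generate calls below are assumptions based on usage shown in other HQQ examples, not part of this change; treat them as illustrative.

```python
# Sketch of the post-commit snippet; imports and the generation step are assumed,
# only the settings/load/prepare lines come from the diff above.
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import prepare_for_inference
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator

#Settings: the backend choice drives the compute dtype
###################################################
backend       = "torchao_int4"  # "torchao_int4", "bitblas", or "gemlite"
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'

#Load the quantized model and tokenizer
###################################################
model_id  = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Use optimized inference kernels
###################################################
prepare_for_inference(model, backend=backend)  # takes a while to init...

#Generate (assumed HFGenerator usage; warmup() pre-compiles the decoding path)
###################################################
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup()
out = gen.generate("Write an essay about large language models.", print_tokens=True)
```

The main design change in this commit is that a single `backend` setting now drives both `compute_dtype` and the `prepare_for_inference` call, so switching kernels only requires editing one line.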