mobicham committed
Commit 5978e10 · verified · 1 Parent(s): 9349b3e

Update README.md

Files changed (1)
  1. README.md +9 -11
README.md CHANGED
@@ -56,24 +56,22 @@ from hqq.utils.patching import *
  from hqq.core.quantize import *
  from hqq.utils.generation_hf import HFGenerator

+ #Settings
+ ###################################################
+ backend = "torchao_int4" #"torchao_int4" (4-bit only) or "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
+ compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+ device = 'cuda:0'
+ cache_dir = '.'
+
  #Load the model
  ###################################################
  model_id = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
-
- compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
- cache_dir = '.'
- model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
+ model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device)
  tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

- quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
- patch_linearlayers(model, patch_add_quant_config, quant_config)
-
  #Use optimized inference kernels
  ###################################################
- HQQLinear.set_backend(HQQBackend.PYTORCH)
- #prepare_for_inference(model) #default backend
- prepare_for_inference(model, backend="torchao_int4")
- #prepare_for_inference(model, backend="bitblas") #takes a while to init...
+ prepare_for_inference(model, backend=backend) #takes a while to init...

  #Generate
  ###################################################
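For context, a minimal end-to-end sketch of how the README snippet reads after this commit. The diff cuts off at the `#Generate` header, so the imports and the `HFGenerator` warmup/generate calls below are assumptions based on usage shown in other HQQ examples, not part of this change; treat them as illustrative.

```python
# Sketch of the post-commit snippet; imports and the generation step are assumed,
# only the settings/load/prepare lines come from the diff above.
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import prepare_for_inference
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator

#Settings: the backend choice drives the compute dtype
###################################################
backend       = "torchao_int4"  # "torchao_int4", "bitblas", or "gemlite"
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'

#Load the quantized model and tokenizer
###################################################
model_id  = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Use optimized inference kernels
###################################################
prepare_for_inference(model, backend=backend)  # takes a while to init...

#Generate (assumed HFGenerator usage; warmup() pre-compiles the decoding path)
###################################################
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup()
out = gen.generate("Write an essay about large language models.", print_tokens=True)
```

The main design change in this commit is that a single `backend` setting now drives both `compute_dtype` and the `prepare_for_inference` call, so switching kernels only requires editing one line.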