Text Generation
Transformers
Safetensors
English
llama
text-generation-inference
4-bit precision
gptq

Fix documentation for loading the model, since the fused attention module doesn't work here either.

#4
by mber - opened
Files changed (1) hide show
  1. README.md +1 -0
README.md CHANGED
@@ -122,6 +122,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
122
 
123
  model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
124
  model_basename=model_basename,
 
125
  use_safetensors=True,
126
  trust_remote_code=False,
127
  device="cuda:0",
 
122
 
123
  model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
124
  model_basename=model_basename,
125
+ inject_fused_attention=False, # Required for TheBloke/FreeWilly2-GPTQ model at this time.
126
  use_safetensors=True,
127
  trust_remote_code=False,
128
  device="cuda:0",