Tonic committed
Commit 435433d · 1 parent: 5d56e20

adds flash attention 2 to attention implementation

Files changed (1)
  1. app.py (+1 -0)
app.py CHANGED
@@ -119,6 +119,7 @@ def load_model():
         "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,  # Use float16 on GPU, float32 on CPU
         "trust_remote_code": True,
         "low_cpu_mem_usage": True,
+        "attention_implementation": "flash_attention_2"
     }
 
     logger.info(f"Model loading parameters: {model_kwargs}")
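
For context, a minimal sketch of how model_kwargs like these are typically consumed, assuming the model is loaded with AutoModelForCausalLM (the rest of load_model() is outside this diff, so MODEL_NAME and the from_pretrained call below are assumptions, not the app's actual code). Note that current Transformers releases spell this kwarg attn_implementation rather than attention_implementation, and Flash Attention 2 additionally requires the flash-attn package and a CUDA GPU:

    import torch
    from transformers import AutoModelForCausalLM

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    MODEL_NAME = "org/model"  # hypothetical id; the real one is not shown in this diff

    model_kwargs = {
        "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,  # float16 on GPU, float32 on CPU
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
        # Transformers expects "attn_implementation"; flash_attention_2 needs flash-attn installed and a CUDA device
        "attn_implementation": "flash_attention_2" if DEVICE == "cuda" else "eager",
    }

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)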