Spaces: Running on Zero
adds flash attention 2 to attention implementation
app.py CHANGED

@@ -119,6 +119,7 @@ def load_model():
         "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,  # Use float16 on GPU, float32 on CPU
         "trust_remote_code": True,
         "low_cpu_mem_usage": True,
+        "attention_implementation": "flash_attention_2"
     }

     logger.info(f"Model loading parameters: {model_kwargs}")
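
For reference, a minimal sketch (not part of the commit) of how a model_kwargs dict like the one above is typically consumed by transformers. The model id is a placeholder. Note that the documented from_pretrained keyword in current transformers releases is attn_implementation, so the attention_implementation spelling added here likely will not select FlashAttention-2; the flag also requires the flash-attn package, a CUDA device, and fp16/bf16 weights.

# Sketch under the assumptions above; "org/model-id" is a placeholder.
import torch
from transformers import AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model_kwargs = {
    "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
}
if DEVICE == "cuda":
    # The documented transformers kwarg is attn_implementation; FlashAttention-2
    # is CUDA-only and needs the flash-attn package plus fp16/bf16 weights.
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained("org/model-id", **model_kwargs)

Guarding the flag behind DEVICE == "cuda" keeps the CPU fallback path working on Spaces hardware where flash-attn is not installed.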