Michael Brunzel committed on
Commit
ec2cda7
1 Parent(s): d12b838

Add Flash Attention 2

Browse files
Files changed (1) hide show
  1. handler.py +6 -1
handler.py CHANGED
@@ -36,7 +36,12 @@ class EndpointHandler:
36
  # load model and processor from path
37
  self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
38
  # attn_implementation="flash_attention_2"
39
- self.model = AutoPeftModelForCausalLM.from_pretrained("MichaelAI23/mistral_7B_v0_2_Textmarker", device_map="auto") # load_in_4bit=True
 
 
 
 
 
40
 
41
  self.template = {
42
  "prompt_input": """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n""",
 
36
  # load model and processor from path
37
  self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
38
  # attn_implementation="flash_attention_2"
39
+ self.model = AutoPeftModelForCausalLM.from_pretrained(
40
+ "MichaelAI23/mistral_7B_v0_2_Textmarker",
41
+ device_map="auto",
42
+ torch_dtype=torch.bfloat16,
43
+ attn_implementation="flash_attention_2"
44
+ ) # load_in_4bit=True
45
 
46
  self.template = {
47
  "prompt_input": """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n""",