Andrewwwwww committed on
Commit
73d8c0b
·
verified ·
1 Parent(s): 32e427b

Upload 2 files

Browse files
Files changed (2) hide show
  1. handler1.py +48 -0
  2. requirements.txt +6 -0
handler1.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code to run inference on Hermes with HF Transformers
2
+ # Requires pytorch, transformers, bitsandbytes, sentencepiece, protobuf, and flash-attn packages
3
+
4
from typing import Any, Dict, List

import torch
#from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, MixtralForCausalLM
import bitsandbytes, flash_attn
8
+
9
class EndpointHandler:
    """Inference handler that answers chat requests with a Mixtral causal LM.

    The handler turns a system prompt plus a chat history into a single
    prompt using ``<|im_start|>`` / ``<|im_end|>`` (ChatML-style) markers,
    samples a completion from the model and returns the newly generated text.
    """

    def __init__(self, path=""):
        """Load the tokenizer and the 4-bit quantized model from ``path``.

        Args:
            path: Directory or hub identifier passed to ``from_pretrained``
                for both the tokenizer and the model.
        """
        self.tokenizer = LlamaTokenizer.from_pretrained(path, trust_remote_code=True)
        self.model = MixtralForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float16,
            device_map="auto",
            load_in_4bit=True,
            # Replaces the deprecated ``use_flash_attention_2=True`` flag.
            attn_implementation="flash_attention_2",
        )

    @staticmethod
    def _build_prompt(sys_prompt, messages):
        """Render the system prompt and chat history as one prompt string.

        Args:
            sys_prompt: Text of the system message; a trailing ``.`` is
                appended to it, as the handler has always done.
            messages: Iterable of mappings with ``"role"`` and ``"content"``
                keys. Any role other than ``"assistant"`` is rendered as a
                ``user`` turn.

        Returns:
            The prompt, ending with an opened ``assistant`` turn so that the
            model continues with its reply.
        """
        parts = [f"<|im_start|>system\n{sys_prompt}.<|im_end|>\n"]
        for message in messages:
            role = "assistant" if message["role"] == "assistant" else "user"
            content = message["content"]
            parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
        parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate the assistant's reply for one request.

        Args:
            data: Request payload. ``data["prompt"]`` is the system prompt and
                ``data["inputs"]`` is the chat history, a list of
                ``{"role": ..., "content": ...}`` mappings.

        Returns:
            A one-element list ``[{"generated_text": response}]`` holding only
            the newly generated text (the prompt tokens are stripped).

        Raises:
            KeyError: If ``"prompt"`` or ``"inputs"`` is missing from ``data``,
                or a history entry lacks ``"role"`` or ``"content"``.
        """
        prompt = self._build_prompt(data["prompt"], data["inputs"])

        # Put the inputs on the device the model was actually loaded on
        # rather than assuming a fixed device name.
        encoded = self.tokenizer(prompt, return_tensors="pt")
        input_ids = encoded.input_ids.to(self.model.device)

        generated_ids = self.model.generate(
            input_ids,
            max_new_tokens=750,
            temperature=0.8,
            repetition_penalty=1.1,
            do_sample=True,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        # generate() returns prompt + completion; keep only the completion.
        prompt_length = input_ids.shape[-1]
        response = self.tokenizer.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        print(f"Response: {response}")
        return [{"generated_text": response}]
48
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ bitsandbytes
4
+ sentencepiece
5
+ protobuf
6
+ flash-attn