Andrewwwwww committed on
Commit 37c8faa
1 Parent(s): 22b1752

Update handler.py

Files changed (1)
  1. handler.py +24 -10
handler.py CHANGED
@@ -1,15 +1,22 @@
-from typing import Dict, List, Any
-import torch
-from modelscope import AutoTokenizer
-from modelscope import AutoModelForCausalLM
+# Code to inference Hermes with HF Transformers
+# Requires pytorch, transformers, bitsandbytes, sentencepiece, protobuf, and flash-attn packages
 
-device = "cuda" # the device to load the model onto
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import LlamaTokenizer, MixtralForCausalLM
+import bitsandbytes, flash_attn
 
 class EndpointHandler:
     def __init__(self, path=""):
-        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device)
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-
+        self.tokenizer = LlamaTokenizer.from_pretrained(path, trust_remote_code=True)
+        self.model = MixtralForCausalLM.from_pretrained(
+            path,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            load_in_8bit=False,
+            load_in_4bit=True,
+            use_flash_attention_2=True
+        )
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         sys_prompt=data["prompt"]
         list=data["inputs"]
@@ -23,12 +30,19 @@ class EndpointHandler:
         prompt+=f"<|im_start|>user\n{content}<|im_end|>\n"
         prompt+="<|im_start|>assistant\n"
 
+        #for chat in prompts:
+        #print(chat)
+        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
+        generated_ids = self.model.generate(input_ids, max_new_tokens=750, temperature=0.8, repetition_penalty=1.1, do_sample=True, eos_token_id=self.tokenizer.eos_token_id)
+        response = self.tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True, clean_up_tokenization_space=True)
+        return response
+
+        """
         encodeds = self.tokenizer.encode(prompt, return_tensors="pt")
         model_inputs = encodeds.to(device)
         self.model.to(device)
         generated_ids = self.model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
         decoded = self.tokenizer.decode(generated_ids[0])
         return decoded
-
-
+        """
 
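
The updated __call__ builds a ChatML-style prompt from data["prompt"] (system) and data["inputs"], then generates on CUDA with the 4-bit Mixtral weights. Below is a minimal local smoke-test sketch, not part of the commit: it assumes handler.py is importable from the working directory, that path points at the locally downloaded model repository, that a CUDA GPU with enough memory for the 4-bit weights is available, and that "inputs" carries plain user-message strings (the loop that consumes them sits in lines this diff does not show). Note that, as far as the shown hunks go, the commit drops the `from typing import Dict, List, Any` line while the `__call__` annotation still uses those names, so the sketch assumes that import is still present somewhere in the file.

# Hypothetical smoke test for the updated EndpointHandler (sketch only; names below are placeholders).
from handler import EndpointHandler

# "/path/to/model-repo" is a placeholder for the locally downloaded weights directory.
handler = EndpointHandler(path="/path/to/model-repo")

payload = {
    "prompt": "You are a helpful assistant.",               # consumed as data["prompt"]
    "inputs": ["Explain 4-bit quantization in one line."],  # consumed as data["inputs"]
}

# __call__ returns the decoded completion text (despite its List[List[Dict[str, float]]] annotation).
print(handler(payload))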