mogaio committed
Commit
db91f60
1 Parent(s): 65e5eaa

Update handler.py

Files changed (1)
  1. handler.py +11 -20
handler.py CHANGED
@@ -22,7 +22,7 @@ class EndpointHandler:
         self.model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
             quantization_config=self.bnb_config,
-            device_map="auto", # Auto selects device to put model on.
+            device_map="auto",
         )
         self.model.config.use_cache = False
 
@@ -31,23 +31,14 @@ class EndpointHandler:
 
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        INTRO = "Below is a conversation between a user and you."
-        END = "Instruction: Write a response appropriate to the conversation."
-        prompt = "<user>:"
-
-        # process input
-        inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
-
-        prompt = prompt+inputs
-        # preprocess
-
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        inputs = self.tokenizer(INTRO+'\n '+prompt+'\n '+END +'\n <assistant>:', return_tensors="pt").to(device)
-
-        inputs = {k: v.to('cuda') for k, v in inputs.items()}
-        output = self.inference_model.generate(input_ids=inputs["input_ids"], pad_token_id=self.tokenizer.pad_token_id, max_new_tokens=100, do_sample=True, temperature=0.1, top_p=0.9, repetition_penalty=1.5)
-        reply = self.tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=True)
+        output = self.inference_model.generate(input_ids=inputs["input_ids"], pad_token_id=self.tokenizer.pad_token_id, max_new_tokens=256, do_sample=True, temperature=0.9, top_p=0.9, repetition_penalty=1.5, early_stopping=True, length_penalty=-0.3, num_beams=5, num_return_sequences=1)
+        response_raw = self.tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=True)
+        response_ls = response_raw[0].split('>>')
+        response_ = response_ls[1].split('<assistant>:')[1]
+        response_ = response_.split('<user>')[0]
+        response_ = response_.split('Instruction:')[0]
+
+        response_ = response_.replace('\n','')
+        response = '<assistant>:' + response_.strip()
 
-        return [{"generated_reply": reply}]
+        return [{"generated_reply": response}]
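
The reply parsing added in this hunk is easiest to reason about in isolation. Below is a minimal, standalone sketch of that post-processing pulled out into a plain function, assuming the decoded text follows the "<user>: ... >> <assistant>: ..." template implied by the markers in the diff; the helper name extract_assistant_reply and the sample string are illustrative and not part of the commit.

# Standalone sketch of the post-processing introduced in this commit.
# extract_assistant_reply is a hypothetical helper, not part of handler.py;
# the '>>' / '<assistant>:' / '<user>' / 'Instruction:' markers are assumed
# to match the prompt template the model was fine-tuned on.

def extract_assistant_reply(decoded: str) -> str:
    """Cut the first assistant turn out of a decoded generation."""
    after_marker = decoded.split('>>')[1]           # drop everything before the '>>' separator
    reply = after_marker.split('<assistant>:')[1]   # keep text after the assistant tag
    reply = reply.split('<user>')[0]                # stop at the next user turn, if any
    reply = reply.split('Instruction:')[0]          # stop at a trailing instruction block
    reply = reply.replace('\n', '').strip()         # flatten newlines, trim whitespace
    return '<assistant>:' + reply

# Example with a decoded string shaped like the expected template:
sample = "<user>: hi >> <assistant>: Hello! How can I help?\n<user>: ..."
print(extract_assistant_reply(sample))  # -> "<assistant>:Hello! How can I help?"

On the generation side, the new call combines do_sample=True with num_beams=5 (beam-search multinomial sampling in transformers), and the negative length_penalty together with early_stopping=True biases the search toward shorter completions.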