jed-tiotuico committed on
Commit 63a675b · 1 Parent(s): aa40bb1
Files changed (1)
  1. handler.py +21 -57
handler.py CHANGED
@@ -1,64 +1,28 @@
-# handler.py
-from peft import AutoPeftModelForCausalLM
-from transformers import AutoTokenizer
-from typing import Dict, List, Any
+from typing import Dict, Any, List
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 2048
+dtype = None
+load_in_4bit = True
 
 class EndpointHandler:
     def __init__(self, path=""):
-        # Initialize and load the model using a Transformers pipeline
-        # Ensure that the model and tokenizer are placed correctly in the specified path
-        # self.model = PreTrainedModel.from_pretrained(path).to(device)
-        # self.tokenizer = PreTrainedTokenizer.from_pretrained(path)
-        # max_seq_length = 1024
-        # dtype = None
-        load_in_4bit = True
-        # self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-        #     model_name=path,
-        #     max_seq_length=max_seq_length,
-        #     dtype=dtype,
-        #     load_in_4bit=load_in_4bit
-        # )
-        # FastLanguageModel.for_inference(self.model)
-
-        from transformers import AutoTokenizer
-        self.model = AutoPeftModelForCausalLM.from_pretrained(
-            path,
-            load_in_4bit = load_in_4bit,
+        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+            model_name = path, # YOUR MODEL YOU USED FOR TRAINING
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit
         )
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        Handles incoming requests to the inference endpoint.
-
-        Args:
-            data (Dict[str, Any]): The request payload containing 'inputs' and any other necessary parameters.
+        FastLanguageModel.for_inference(self.model) # Enable native 2x faster inference
+        inputs = self.tokenizer([
+            """Q:
+I am having problems with my computer. It is not turning on. What should I do?
+A:"""
+        ], return_tensors = "pt").to("cuda")
 
-        Returns:
-            List[Dict[str, Any]]: The prediction results formatted as a list of dictionaries.
-        """
-        # encode the input text
-        # inputs = self.tokenizer([
-        #     """<s>\nQ:\nI am having problems with my computer. It is not turning on. What should I do?\n"A:\n"""
-        # ], return_tensors="pt").to("cuda")
-        # # generate text based on the input
-        # outputs = self.model.generate(**inputs, max_new_tokens=1000, use_cache=True)
-        # result = self.tokenizer.batch_decode(outputs)
-        # return [{"generated_text": result[0]}]
-
-        # encode the input text
-        inputs = self.tokenizer.encode(
-            text="\n<s>\nQ:\nI am having problems with my computer. It is not turning on. What should I do?\nA:\n",
-            return_tensors="pt",
-            max_length=1024,
-            truncation=True,
-            padding=True,
-        ).to(self.model.device)
-        outputs = self.model.generate(
-            inputs,
-            max_length=1024,
-            num_return_sequences=1,
-            use_cache=True,
-        )
-        result = self.tokenizer.batch_decode(outputs)
-        return [{"generated_text": result}]
+        outputs = self.model.generate(**inputs, max_new_tokens = 1000, use_cache = True)
+        results = self.tokenizer.batch_decode(outputs)
+        return results
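
As committed, __call__ ignores the incoming data payload, always generates from one hard-coded prompt, and returns a bare list of strings even though its annotation promises List[Dict[str, Any]]. Below is a minimal sketch of a __call__ body that honors the request instead, assuming the usual Inference Endpoints payload shape {"inputs": "..."}; the question and prompt names are illustrative, and the "generated_text" key simply mirrors the shape the pre-commit revision returned. It presumes self.model and self.tokenizer were loaded in __init__ exactly as in the commit above.

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Hypothetical: read the question from the {"inputs": ...} request
        # body instead of hard-coding it; fall back to an empty string.
        question = data.get("inputs", "")
        prompt = f"Q:\n{question}\nA:"

        # self.model.device avoids assuming a CUDA host, as the
        # pre-commit revision of this handler already did.
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=1000, use_cache=True)
        results = self.tokenizer.batch_decode(outputs)

        # Wrap each decoded string so the return value matches the annotation
        # (and the {"generated_text": ...} shape of the pre-commit revision).
        return [{"generated_text": text} for text in results]

Calling handler({"inputs": "It is not turning on."}) locally would then exercise the same path the deployed endpoint runs.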