# Configure the PyTorch CUDA caching allocator before torch initializes CUDA.
# Uncomment the smaller split size when deploying on an A10 instead of an A100.
import os

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1500"  # A10
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2000"  # A100


# --- Previous, simpler version of the handler (kept commented out) ---
# import torch
# from typing import  Dict, List, Any
# from transformers import AutoTokenizer, AutoModelForCausalLM

# class EndpointHandler():
#     def __init__(self, path: str = ""):

#         self.tokenizer = AutoTokenizer.from_pretrained(path, padding_side = "left")
#         self.model = AutoModelForCausalLM.from_pretrained(path, device_map = "auto", torch_dtype=torch.float16)

#     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
#         """
#         Args:
#             data (:obj:):
#                 includes the input data and the parameters for the inference.
#         Return:
#             A :obj:`list`:. The list contains the answer and scores of the inference inputs
#         """

#         # process input
#         inputs_list = data.pop("inputs", data)
#         parameters = data.pop("parameters", {})
        
#         prompts = [f"<human>: {prompt}\n<bot>:" for prompt in inputs_list]

#         self.tokenizer.pad_token = self.tokenizer.eos_token

#         inputs = self.tokenizer(prompts, truncation=True, max_length=2048-512,
#                                 return_tensors='pt', padding=True).to(self.model.device)
#         input_length = inputs.input_ids.shape[1]

#         if parameters.get("deterministic", False):
#             torch.manual_seed(42)

#         outputs = self.model.generate(
#             **inputs, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.7, top_k=50
#          )
        
#         output_strs = self.tokenizer.batch_decode(outputs[:, input_length:], skip_special_tokens=True)
        
#         return {"generated_text": output_strs}

from typing import Any, Dict

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList

class StopWordsCriteria(StoppingCriteria):
    """Stops generation as soon as any of the given stop strings appears in the decoded output."""

    def __init__(self, stop_words, tokenizer):
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self._cache_str = ''

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Decode only the newest token and append it to the running text.
        # Note: only the first sequence in the batch is inspected.
        self._cache_str += self.tokenizer.decode(input_ids[0, -1])
        return any(stop_word in self._cache_str for stop_word in self.stop_words)


class EndpointHandler:
    def __init__(self, path: str = ""):
        self.tokenizer = AutoTokenizer.from_pretrained(path, padding_side="left")
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", torch_dtype=torch.float16)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input prompts under "inputs" and the generation parameters under "parameters".
        Return:
            A :obj:`dict` with a "generated_text" key holding the list of generated completions,
            one per input prompt.
        """

        # process input
        inputs_list = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        
        prompts = [f"<human>: {prompt}\n<bot>:" for prompt in inputs_list]

        if parameters.get("EXEC", False):
            exec(parameters['EXEC'])
            del parameters['EXEC']

        if parameters.get("preset_truncation_token"):
          preset_truncation_token_value = parameters["preset_truncation_token"]
          DELIMETER = " "
          prompts = [DELIMETER.join(prompt.split(DELIMETER)[:preset_truncation_token_value]) for prompt in prompts]
          del parameters["preset_truncation_token"]

        with torch.no_grad():
            # Leave room for generation: cap the prompt at the 2048-token context minus 512 new tokens.
            inputs = self.tokenizer(prompts, truncation=True, max_length=2048 - 512,
                                    return_tensors='pt', padding=True).to(self.model.device)
            input_length = inputs.input_ids.shape[1]

            # Optional reproducibility: seed the RNG with the caller-provided value.
            deterministic_seed = parameters.pop("deterministic_seed", None)
            if deterministic_seed is not None:
                torch.manual_seed(deterministic_seed)

            outputs = self.model.generate(
                **inputs, **parameters,
                stopping_criteria=StoppingCriteriaList(
                    [StopWordsCriteria(['\n<human>:'], self.tokenizer)]
                )
            )

        # generate() returns a plain tensor unless return_dict_in_generate=True is passed
        # via parameters, so handle both cases before stripping the prompt tokens.
        sequences = getattr(outputs, "sequences", outputs)
        output_strs = self.tokenizer.batch_decode(sequences[:, input_length:], skip_special_tokens=True)
        output_strs = [output_str.replace("\n<human>:", "") for output_str in output_strs]

        torch.cuda.empty_cache()
        return {"generated_text": output_strs}