# Cap the CUDA caching allocator's split size to reduce memory fragmentation; the value
# depends on the GPU the endpoint is deployed on.
# import os; os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:1500'  # A10
import os; os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:2000'  # A100
# import torch
# from typing import Dict, List, Any
# from transformers import AutoTokenizer, AutoModelForCausalLM
# class EndpointHandler:
# def __init__(self, path: str = ""):
# self.tokenizer = AutoTokenizer.from_pretrained(path, padding_side = "left")
# self.model = AutoModelForCausalLM.from_pretrained(path, device_map = "auto", torch_dtype=torch.float16)
# def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
# """
# Args:
# data (:obj:):
# includes the input data and the parameters for the inference.
# Return:
# A :obj:`list`:. The list contains the answer and scores of the inference inputs
# """
# # process input
# inputs_dict = data.pop("inputs", data)
# parameters = data.pop("parameters", {})
# prompts = [f"<human>: {prompt}\n<bot>:" for prompt in inputs_dict]
# self.tokenizer.pad_token = self.tokenizer.eos_token
# inputs = self.tokenizer(prompts, truncation=True, max_length=2048-512,
# return_tensors='pt', padding=True).to(self.model.device)
# input_length = inputs.input_ids.shape[1]
# if parameters.get("deterministic", False):
# torch.manual_seed(42)
# outputs = self.model.generate(
# **inputs, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.7, top_k=50
# )
# output_strs = self.tokenizer.batch_decode(outputs[:, input_length:], skip_special_tokens=True)
# return {"generated_text": output_strs}
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
from typing import Dict, List, Any


class StopWordsCriteria(StoppingCriteria):
    """Stops generation once any of the given stop strings appears in the decoded output."""

    def __init__(self, stop_words, tokenizer):
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self._cache_str = ''

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Decode only the newest token and append it to the running text cache.
        # Note: only the first sequence in the batch is checked.
        self._cache_str += self.tokenizer.decode(input_ids[0, -1])
        for stop_word in self.stop_words:
            if stop_word in self._cache_str:
                return True
        return False


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Left padding keeps the prompt tokens flush against the generated continuation in a batch.
        self.tokenizer = AutoTokenizer.from_pretrained(path, padding_side="left")
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", torch_dtype=torch.float16)
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Contains the input prompts under ``"inputs"`` and optional generation
                parameters under ``"parameters"``.
        Return:
            A :obj:`dict` with a ``"generated_text"`` key holding one generated completion
            per input prompt.
        """
        # process input
        inputs_list = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        # Wrap each raw prompt in the <human>/<bot> chat format the model expects.
        prompts = [f"<human>: {prompt}\n<bot>:" for prompt in inputs_list]
if parameters.get("EXEC", False):
exec(parameters['EXEC'])
del parameters['EXEC']
if parameters.get("preset_truncation_token"):
preset_truncation_token_value = parameters["preset_truncation_token"]
DELIMETER = " "
prompts = [DELIMETER.join(prompt.split(DELIMETER)[:preset_truncation_token_value]) for prompt in prompts]
del parameters["preset_truncation_token"]
        with torch.no_grad():
            # Reserve 512 tokens of the 2048-token context window for generation.
            inputs = self.tokenizer(prompts, truncation=True, max_length=2048 - 512,
                                    return_tensors='pt', padding=True).to(self.model.device)
            input_length = inputs.input_ids.shape[1]
            if "deterministic_seed" in parameters:
                torch.manual_seed(parameters.pop("deterministic_seed"))
            outputs = self.model.generate(
                **inputs, **parameters,
                stopping_criteria=StoppingCriteriaList(
                    [StopWordsCriteria(['\n<human>:'], self.tokenizer)]
                )
            )
        # generate() returns a plain tensor unless return_dict_in_generate=True is passed via
        # `parameters`, so handle both cases before slicing off the prompt tokens.
        sequences = outputs.sequences if hasattr(outputs, "sequences") else outputs
        output_strs = self.tokenizer.batch_decode(sequences[:, input_length:], skip_special_tokens=True)
        # Strip any stop string that made it into the decoded text.
        output_strs = [output_str.replace("\n<human>:", "") for output_str in output_strs]
        torch.cuda.empty_cache()
        return {"generated_text": output_strs}