from typing import List, Tuple

import torch
from transformers import AutoTokenizer, AutoModel
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList

DEFAULT_MODEL_PATH = "THUDM/chatglm2-6b"
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Guards against NaN/inf logits by forcing a fixed fallback token."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            # Zero all scores and put a large weight on token id 5, so
            # generation degrades gracefully instead of crashing.
            scores.zero_()
            scores[..., 5] = 5e4
        return scores
class ChatGLM2:
    def __init__(self, model_path: str = None):
        # Fall back to the default checkpoint when no path is given.
        # (The original only set self.model_path in the fallback branch,
        # so a caller-supplied path was silently ignored.)
        self.model_path = model_path or DEFAULT_MODEL_PATH
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
        # fp16 weights on GPU; .cuda() requires a CUDA device.
        model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True).half().cuda()
        self.model = model.eval()
    def generate(
        self,
        prompt: str,
        do_sample: bool = True,
        max_length: int = 8192,
        num_beams: int = 1,
        temperature: float = 0.8,
        top_p: float = 0.8,
    ) -> str:
        logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample,
                      "top_p": top_p, "temperature": temperature, "logits_processor": logits_processor}
        inputs = self.tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(self.model.device)
        outputs = self.model.generate(**inputs, **gen_kwargs)
        # Drop the prompt tokens so only newly generated text is decoded.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
        response = self.tokenizer.decode(outputs)
        # process_response is provided by the ChatGLM2 remote code.
        response = self.model.process_response(response)
        return response
    def stream_generate(
        self,
        prompt: str,
        do_sample: bool = True,
        max_length: int = 8192,
        temperature: float = 0.8,
        top_p: float = 0.8,
    ):
        logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor}
        inputs = self.tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(self.model.device)
        # stream_generate is provided by the ChatGLM2 remote code and yields
        # the full output ids after each decoding step.
        for outputs in self.model.stream_generate(**inputs, **gen_kwargs):
            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
            response = self.tokenizer.decode(outputs)
            # Skip partial outputs that end in an incomplete multi-byte character.
            if response and response[-1] != "�":
                response = self.model.process_response(response)
                yield response
    def stream_chat(
        self,
        query: str,
        history: List[Tuple[str, str]],
        max_length: int = 8192,
        do_sample: bool = True,
        top_p: float = 0.8,
        temperature: float = 0.8,
    ):
        # Delegate to the model's own stream_chat (remote code), which keeps
        # history as a list of (query, response) pairs.
        stream = self.model.stream_chat(self.tokenizer, query, history,
                                        max_length=max_length, do_sample=do_sample,
                                        top_p=top_p, temperature=temperature)
        for resp, new_history in stream:
            yield resp, new_history
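

# Minimal usage sketch (an illustrative addition, not part of the original
# module): assumes a CUDA GPU and network access to download the
# THUDM/chatglm2-6b weights on first run.
if __name__ == "__main__":
    llm = ChatGLM2()

    # One-shot generation.
    print(llm.generate("What is 1 + 1?"))

    # Streaming generation: each yield is the decoded response so far.
    for partial in llm.stream_generate("Explain attention in one sentence."):
        print(partial)

    # Multi-turn streaming chat; history is a list of (query, response) pairs.
    history: List[Tuple[str, str]] = []
    for resp, history in llm.stream_chat("Hello!", history):
        print(resp)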