import os

import torch
import transformers

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


class ChatService:
    def __init__(self):
        pass

    @staticmethod
    def load_model(model_name=""):
        global tokenizer, pipeline
        print("Loading " + model_name + "...")

        # Report available GPUs; device_map="auto" shards the model across them.
        gpu_count = torch.cuda.device_count()
        print('gpu_count', gpu_count)

        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        pipeline = transformers.pipeline(
            task="text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )

    @staticmethod
    def generate_message(req):
        history = req["chat"]
        assistant_name = req["assistant_name"] + ": "
        system_message = req.get("system_message") if req.get("system_message") is not None else ""
        temperature = req.get("temperature") if req.get("temperature") is not None else 1
        top_p = req.get("top_p") if req.get("top_p") is not None else 1
        top_k = req.get("top_k") if req.get("top_k") is not None else 10
        max_length = req.get("max_length") if req.get("max_length") is not None else 1000

        # Build a Llama-2-style chat prompt: the system message sits inside
        # <<SYS>> ... <</SYS>> tags and the whole turn is wrapped in [INST] ... [/INST].
        ending_tag = "[/INST]"
        fulltext = ("[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n"
                    + "\n\n".join(history) + "\n\n" + assistant_name + ending_tag)

        sequences = pipeline(
            fulltext,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=max_length,
        )

        # Keep only the text generated after the prompt, then drop the
        # "<assistant_name>: " prefix if the model echoed it back.
        response = sequences[0]['generated_text'].split(ending_tag)[1].split(assistant_name)
        response = response[1] if len(response) > 1 else response[0]
        response = response.strip()
        return response
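

# Usage sketch (assumption: the model name below is an illustrative Llama-2 chat
# checkpoint, and the request dict fields mirror the keys generate_message reads;
# neither is prescribed by this file).
if __name__ == "__main__":
    ChatService.load_model("meta-llama/Llama-2-7b-chat-hf")
    reply = ChatService.generate_message({
        "chat": ["User: What is the capital of France?"],
        "assistant_name": "Assistant",
        "system_message": "You are a concise, helpful assistant.",
        "temperature": 0.7,
        "max_length": 512,
    })
    print(reply)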