from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

# Prompt layout fed to the model: the character's persona, the prior chat
# history, the character's greeting, the latest user turn, then the
# character's name as the generation cue.
template = """{char_name}'s Persona: {char_persona}
{chat_history}
{char_name}: {char_greeting}
{user_name}: {user_input}
{char_name}: """


class EndpointHandler:
    def __init__(self, path=""):
        # Load the tokenizer and the model in 8-bit, letting device_map="auto"
        # spread layers across the available devices.
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            load_in_8bit=True,
            device_map="auto",
        )

        # Wrap a transformers text-generation pipeline so LangChain can drive it.
        local_llm = HuggingFacePipeline(
            pipeline=pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=2048,
                temperature=0.5,
                top_p=0.9,
                top_k=0,
                repetition_penalty=1.1,
                pad_token_id=50256,
                num_return_sequences=1,
            )
        )

        prompt_template = PromptTemplate(
            template=template,
            input_variables=[
                "user_input",
                "user_name",
                "char_name",
                "char_persona",
                "char_greeting",
                "chat_history",
            ],
            validate_template=True,
        )

        self.llm_engine = LLMChain(
            llm=local_llm,
            prompt=prompt_template,
        )

    def __call__(self, data):
        # Inference Endpoints wrap the payload under an "inputs" key; fall back
        # to the raw dict if it is already unwrapped.
        inputs = data.pop("inputs", data)
        try:
            # Keep only the first generated line so the reply stops at the
            # character's turn instead of continuing the dialogue.
            response = self.llm_engine.predict(
                user_input=inputs["user_input"],
                user_name=inputs["user_name"],
                char_name=inputs["char_name"],
                char_persona=inputs["char_persona"],
                char_greeting=inputs["char_greeting"],
                chat_history=inputs["chat_history"],
            ).split("\n", 1)[0]
            return {"inputs": inputs, "text": response}
        except Exception as e:
            return {"inputs": inputs, "error": str(e)}
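

# --- Usage sketch (not part of the deployed handler) ------------------------
# A minimal local smoke test, assuming a model that can be loaded in 8-bit is
# checked out at MODEL_PATH (a hypothetical path) and that the request payload
# follows the Inference Endpoints convention of nesting fields under "inputs".
# The persona and conversation values below are illustrative placeholders.
if __name__ == "__main__":
    MODEL_PATH = "./model"  # hypothetical local path to the model weights

    handler = EndpointHandler(path=MODEL_PATH)
    payload = {
        "inputs": {
            "user_input": "Hi there, how has your day been?",
            "user_name": "User",
            "char_name": "Assistant",
            "char_persona": "A friendly, curious conversational partner.",
            "char_greeting": "Hello! Nice to meet you.",
            "chat_history": "",
        }
    }
    # Expected shape on success: {"inputs": {...}, "text": "<first reply line>"}
    print(handler(payload))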