lchakkei committed
Commit 82a98ce
1 Parent(s): c3714b0

Update handler.py

Files changed (1)
  1. handler.py +29 -31
handler.py CHANGED
@@ -14,7 +14,6 @@ from langchain.memory import ConversationBufferMemory
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from llm_for_langchain import LLM
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
@@ -25,7 +24,6 @@ from operator import itemgetter
 from langchain.schema import format_document
 from langchain.memory import ConversationBufferMemory
 from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
-

 class EndpointHandler():
     def __init__(self, path=""):
@@ -37,32 +35,31 @@ class EndpointHandler():
         # Create LLM

         # load the tokenizer and the quantized mistral model
-        model = AutoModelForCausalLM.from_pretrained(
-            path,
-            device_map="auto")
-
-        tokenizer = AutoTokenizer.from_pretrained(path)
+        # chat = HuggingFacePipeline.from_model_id(
+        #     model_id=path,
+        #     task="text-generation",
+        #     device=0,
+        #     pipeline_kwargs={"max_new_tokens": 1024},
+        # )
+
+        model_id = path

-        # using HuggingFace's pipeline
-        pipeline = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            use_cache=True,
-            device_map="auto",
-            max_new_tokens=5000,
-            do_sample=True,
-            top_k=1,
-            temperature = 0.01,
-            num_return_sequences=1,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.eos_token_id,
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            padding_side="left",
+            add_eos_token=True,
+            use_fast=False
         )
-        chat = HuggingFacePipeline(pipeline=pipeline)
+        tokenizer.pad_token = tokenizer.eos_token
+
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
+        chat = HuggingFacePipeline(pipeline=pipe)

         # Create Text-Embedding Model
         embedding_function = HuggingFaceBgeEmbeddings(
-            model_name="DMetaSoul/Dmeta-embedding",
+            model_name="BAAI/bge-large-zh",
             model_kwargs={'device': 'cuda'},
             encode_kwargs={'normalize_embeddings': True}
         )
@@ -100,7 +97,7 @@ class EndpointHandler():

         Question: {question} [/INST]
         """
-        
+
         ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

         self.memory = ConversationBufferMemory(
@@ -119,7 +116,7 @@ class EndpointHandler():
                 "chat_history": lambda x: get_buffer_string(x["chat_history"]),
             }
             | CONDENSE_QUESTION_PROMPT
-            | chat(temperature=0)
+            | chat
             | StrOutputParser(),
         }

@@ -150,16 +147,17 @@ class EndpointHandler():
         self.final_chain = loaded_memory | standalone_question | retrieved_documents | answer

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        # pseudo
-        # self.model(input)
-        inputs = data.pop("inputs", data)
-        result = self.final_chain.invoke(inputs)
-        print(result['answer'])
+        # get inputs
+        inputs = data.pop("inputs",data)
+        date = data.pop("date", None)
+
+        result = self.final_chain.invoke({"question": inputs})

         # Note that the memory does not save automatically
         # This will be improved in the future
         # For now you need to save it yourself
+
         self.memory.save_context(inputs, {"answer": result["answer"].content})
         self.memory.load_memory_variables({})

-        return result
+        return result
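For reference, a minimal sketch of how the revised handler could be exercised after this commit. It assumes handler.py is importable, that a CUDA device is available for the BGE embeddings, and that the payload carries the question under the "inputs" key, which is how __call__ reads it above; the model path and the question text are placeholders, not part of the commit.

# Local smoke test for the updated EndpointHandler -- a sketch, not part of this commit.
from handler import EndpointHandler

# Placeholder model path; substitute the checkpoint the endpoint actually serves.
handler = EndpointHandler(path="mistralai/Mistral-7B-Instruct-v0.1")

# The Inference Endpoint runtime posts a JSON payload; __call__ pops "inputs"
# and wraps it as {"question": ...} before invoking the retrieval chain.
result = handler({"inputs": "Summarise the indexed documents in one paragraph."})

# The chain returns a dict whose "answer" field holds the generated reply.
print(result["answer"])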