lchakkei committed · verified
Commit 6f01379 · Parent: e0c7c1b

Update handler.py

Files changed (1)
  1. handler.py +124 -82
handler.py CHANGED
@@ -28,117 +28,159 @@ from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
 from langchain_core.runnables import RunnableParallel
 
 class EndpointHandler():
 
     def __init__(self, path=""):
 
         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"
 
-        # Create LLM
-        model_id = path
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map='auto',
-            torch_dtype=torch.float16,
-            load_in_8bit=True
-        )
-        model.eval()
-
-        # model_kwargs = {
-        #     "input_ids": input_ids,
-        #     "max_new_tokens": 1024,
-        #     "do_sample": True,
-        #     "top_k": 50,
-        #     "top_p": self.top_p,
-        #     "temperature": self.temperature,
-        #     "repetition_penalty": 1.2,
-        #     "eos_token_id": self.tokenizer.eos_token_id,
-        #     "bos_token_id": self.tokenizer.bos_token_id,
-        #     "pad_token_id": self.tokenizer.pad_token_id
-        # }
-
-        model_kwargs = {
-            "do_sample": True,
-            "temperature": 0.2,
-            "max_length": 1024
-        }
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-        )
-        tokenizer.pad_token = tokenizer.eos_token
-
-        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
-        chat = HuggingFacePipeline(pipeline=pipe, model_kwargs=model_kwargs)
-
-        # Create Text-Embedding Model
-        embedding_function = HuggingFaceBgeEmbeddings(
-            model_name="mixedbread-ai/mxbai-embed-large-v1",
-            model_kwargs={'device': 'cuda'},
-            encode_kwargs={'normalize_embeddings': True}
-        )
 
         # Load Vector db
         urls = [
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
             "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
-            "https://www.cityu.edu.hk/president/about"
         ]
-
         loader = WebBaseLoader(urls)
-        data = loader.load()
 
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
-        all_splits = text_splitter.split_documents(data)
 
-        vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
-        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})
 
-        # compressor = LLMChainExtractor.from_llm(chat)
-        # compression_retriever = ContextualCompressionRetriever(
-        #     base_compressor=compressor, base_retriever=retriever
-        # )
 
-        template = """Use the following pieces of context to answer the question at the end.
-        If you don't know the answer, just say that you don't know, don't try to make up an answer.
-        Use three sentences maximum and keep the answer as concise as possible.
-        Always say "thanks for asking!" at the end of the answer.
 
-        {context}
 
-        Question: {question}
 
-        Helpful Answer:"""
-        custom_rag_prompt = PromptTemplate.from_template(template)
 
-        rag_chain_from_docs = (
-            RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
-            | custom_rag_prompt
-            | chat
-            | StrOutputParser()
         )
 
-        self.rag_chain_with_source = RunnableParallel(
-            {"context": retriever, "question": RunnablePassthrough()}
-        ).assign(answer=rag_chain_from_docs)
-
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         # get inputs
         inputs = data.pop("inputs",data)
         date = data.pop("date", None)
 
-        result = self.rag_chain_with_source.invoke(inputs)
-
-        # answer = result['answer']
-
-        # Note that the memory does not save automatically
-        # This will be improved in the future
-        # For now you need to save it yourself
-        # self.memory.save_context(inputs, {"answer": answer})
 
-        # self.memory.load_memory_variables({})
 
-        return result
 
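For reference, the removed chain returned the full RunnableParallel payload rather than a plain answer string. A minimal sketch of the shape the old __call__ returned, following LangChain's RunnableParallel/.assign semantics (the query and values below are hypothetical):

    result = handler.rag_chain_with_source.invoke("Who is the president of CityU?")  # hypothetical query
    # result == {
    #     "context": [Document(page_content="...", metadata={...}), ...],  # the k=2 retrieved Documents
    #     "question": "Who is the president of CityU?",
    #     "answer": "... thanks for asking!",  # hypothetical model output
    # }
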
 from langchain_core.runnables import RunnableParallel
 
+# Imports for the new FAISS/quantization path (skip any already present above)
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores.utils import DistanceStrategy
+from transformers import BitsAndBytesConfig
+
+EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
+
+# Split on Markdown structure first, then fall back to coarser separators
+MARKDOWN_SEPARATORS = [
+    "\n#{1,6} ",
+    "```\n",
+    "\n\\*\\*\\*+\n",
+    "\n---+\n",
+    "\n___+\n",
+    "\n\n",
+    "\n",
+    " ",
+    "",
+]
+
 class EndpointHandler():
+    @staticmethod
+    def split_documents(
+        chunk_size: int,
+        knowledge_base: list,
+        tokenizer_name: str = EMBEDDING_MODEL_NAME,
+    ):
+        """
+        Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
+        """
+        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            AutoTokenizer.from_pretrained(tokenizer_name),
+            chunk_size=chunk_size,
+            chunk_overlap=int(chunk_size / 10),
+            add_start_index=True,
+            strip_whitespace=True,
+            separators=MARKDOWN_SEPARATORS,
+        )
+
+        docs_processed = []
+        for doc in knowledge_base:
+            docs_processed += text_splitter.split_documents([doc])
+
+        # Remove duplicates
+        unique_texts = {}
+        docs_processed_unique = []
+        for doc in docs_processed:
+            if doc.page_content not in unique_texts:
+                unique_texts[doc.page_content] = True
+                docs_processed_unique.append(doc)
+
+        return docs_processed_unique
+
     def __init__(self, path=""):
 
         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"
 
         # Load Vector db
         urls = [
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
+            "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
             "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
         ]
+
         loader = WebBaseLoader(urls)
+        docs = loader.load()
 
+        docs_processed = self.split_documents(
+            512,  # We choose a chunk size (in tokens) adapted to our embedding model
+            docs,
+            tokenizer_name=EMBEDDING_MODEL_NAME,
+        )
 
+        embedding_model = HuggingFaceEmbeddings(
+            model_name=EMBEDDING_MODEL_NAME,
+            multi_process=True,
+            model_kwargs={"device": "cuda"},
+            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
+        )
 
+        self.vectorstore = FAISS.from_documents(
+            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
+        )
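+        # COSINE here pairs with the normalized embeddings configured above.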
+
+        # Create LLM
+        READER_MODEL_NAME = path
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
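+        # 4-bit NF4 storage with bf16 compute roughly quarters the reader's
+        # weight memory vs fp16 (exact savings are model-dependent).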
+        model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
+        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
+
+        # Testing
+        # tokenizer.pad_token = tokenizer.eos_token
 
+        self.READER_LLM = pipeline(
+            model=model,
+            tokenizer=tokenizer,
+            task="text-generation",
+            do_sample=True,
+            temperature=0.2,
+            repetition_penalty=1.1,
+            return_full_text=False,
+            max_new_tokens=256,
+        )
 
+        prompt_in_chat_format = [
+            {
+                "role": "system",
+                "content": """Using the information contained in the context, respond only to the question asked.
+        The response should be concise and relevant to the question.
+        If the answer cannot be deduced from the context, do not give an answer.""",
+            },
+            {
+                "role": "user",
+                "content": """Context: {context}
+        Now here is the question you need to answer.
+        Question: {question}""",
+            },
+        ]
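+
+        # tokenize=False returns the chat template as a plain string, so the literal
+        # {context} and {question} placeholders survive for .format() in __call__.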
+        self.RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
+            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
         )
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         # get inputs
         inputs = data.pop("inputs",data)
         date = data.pop("date", None)
 
+        retrieved_docs = self.vectorstore.similarity_search(query=inputs, k=2)
+
+        retrieved_docs_text = [
+            doc.page_content for doc in retrieved_docs
+        ]  # we only need the text of the documents
+        context = "\nExtracted documents:\n"
+        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])
+
+        final_prompt = self.RAG_PROMPT_TEMPLATE.format(
+            question=inputs, context=context
+        )
 
+        # Generate an answer
+        answer = self.READER_LLM(final_prompt)[0]["generated_text"]
+
+        return answer
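
A minimal smoke test for the updated handler, assuming the Inference Endpoints contract of a JSON body with an "inputs" field; the model path and query below are hypothetical:

    handler = EndpointHandler(path="mistralai/Mistral-7B-Instruct-v0.2")  # hypothetical reader model
    print(handler({"inputs": "Who is the 5th president of CityU?"}))  # prints the generated answer string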