Update handler.py
handler.py  CHANGED  (+124 -82)
@@ -28,117 +28,159 @@ from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
Previous version (removed lines are prefixed with "-"):

 from langchain_core.runnables import RunnableParallel

 class EndpointHandler():
     def __init__(self, path=""):

         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"

-        model_id = path
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map='auto',
-            torch_dtype=torch.float16,
-            load_in_8bit=True
-        )
-        model.eval()
-
-        # model_kwargs = {
-        #     "input_ids":input_ids,
-        #     "max_new_tokens":1024,
-        #     "do_sample":True,
-        #     "top_k":50,
-        #     "top_p":self.top_p,
-        #     "temperature":self.temperature,
-        #     "repetition_penalty":1.2,
-        #     "eos_token_id":self.tokenizer.eos_token_id,
-        #     "bos_token_id":self.tokenizer.bos_token_id,
-        #     "pad_token_id":self.tokenizer.pad_token_id
-        # }
-
-        model_kwargs = {
-            "do_sample": True,
-            "temperature": 0.2,
-            "max_length": 1024
-        }
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-        )
-        tokenizer.pad_token = tokenizer.eos_token
-
-        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
-        chat = HuggingFacePipeline(pipeline=pipe, model_kwargs=model_kwargs)
-
-        # Create Text-Embedding Model
-        embedding_function = HuggingFaceBgeEmbeddings(
-            model_name="mixedbread-ai/mxbai-embed-large-v1",
-            model_kwargs={'device': 'cuda'},
-            encode_kwargs={'normalize_embeddings': True}
-        )

         # Load Vector db
         urls = [
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
             "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
-            "https://www.cityu.edu.hk/president/about"
         ]
-
         loader = WebBaseLoader(urls)

-        […removed lines not captured in this view…]
-            | custom_rag_prompt
-            | chat
-            | StrOutputParser()
         )

-        self.rag_chain_with_source = RunnableParallel(
-            {"context": retriever, "question": RunnablePassthrough()}
-        ).assign(answer=rag_chain_from_docs)
-
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         # get inputs
         inputs = data.pop("inputs",data)
         date = data.pop("date", None)

-        […]
-        #answer = result['answer']
-
-        # Note that the memory does not save automatically
-        # This will be improved in the future
-        # For now you need to save it yourself
-        # self.memory.save_context(inputs, {"answer": answer})

-        […]
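Only the tail of the removed LCEL chain survives in the view above (the `| custom_rag_prompt | chat | StrOutputParser()` fragment and the `RunnableParallel(...).assign(answer=rag_chain_from_docs)` wrapper). For orientation, the standard LangChain "RAG with sources" pattern those fragments belong to looks roughly like the sketch below. The names `retriever`, `custom_rag_prompt` and `chat` are taken from the old code; `build_rag_chain` and `format_docs` are hypothetical helpers, and this is not claimed to be the removed code verbatim.

    # Sketch of the usual LCEL "RAG with sources" wiring (assumed, for reference only).
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.runnables import RunnableParallel, RunnablePassthrough

    def build_rag_chain(retriever, custom_rag_prompt, chat):
        # `retriever`, `custom_rag_prompt` and `chat` are the objects the removed code built:
        # a vector-store retriever, a prompt template and the HuggingFacePipeline LLM.
        def format_docs(docs):
            # Join the retrieved documents into one context string.
            return "\n\n".join(doc.page_content for doc in docs)

        rag_chain_from_docs = (
            RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
            | custom_rag_prompt
            | chat
            | StrOutputParser()
        )

        return RunnableParallel(
            {"context": retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain_from_docs)

Invoking such a chain with a question returns a dict holding "context", "question" and "answer", which lines up with the commented-out result['answer'] access in the old __call__.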
Updated version (added lines are prefixed with "+"):

 from langchain_core.runnables import RunnableParallel

 class EndpointHandler():
+    def split_documents(
+        chunk_size: int,
+        knowledge_base: [],
+        tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
+    ):
+        """
+        Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
+        """
+        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            AutoTokenizer.from_pretrained(tokenizer_name),
+            chunk_size=chunk_size,
+            chunk_overlap=int(chunk_size / 10),
+            add_start_index=True,
+            strip_whitespace=True,
+            separators=MARKDOWN_SEPARATORS,
+        )
+
+        docs_processed = []
+        for doc in knowledge_base:
+            docs_processed += text_splitter.split_documents([doc])
+
+        # Remove duplicates
+        unique_texts = {}
+        docs_processed_unique = []
+        for doc in docs_processed:
+            if doc.page_content not in unique_texts:
+                unique_texts[doc.page_content] = True
+                docs_processed_unique.append(doc)
+
+        return docs_processed_unique
+
     def __init__(self, path=""):

         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"

+        EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"

         # Load Vector db
         urls = [
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
             "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
+            "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
             "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
         ]
+
         loader = WebBaseLoader(urls)
+        docs = loader.load()

+        MARKDOWN_SEPARATORS = [
+            "\n#{1,6} ",
+            "```\n",
+            "\n\\*\\*\\*+\n",
+            "\n---+\n",
+            "\n___+\n",
+            "\n\n",
+            "\n",
+            " ",
+            "",
+        ]

+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, # the maximum number of characters in a chunk: we selected this value arbitrarily
+            chunk_overlap=100, # the number of characters to overlap between chunks
+            add_start_index=True, # If `True`, includes chunk's start index in metadata
+            strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
+            separators=MARKDOWN_SEPARATORS,
+        )
+
+        docs_processed = text_splitter.split_documents(docs)

+        docs_processed = split_documents(
+            512, # We choose a chunk size adapted to our model
+            docs,
+            tokenizer_name=EMBEDDING_MODEL_NAME,
+        )

+        embedding_model = HuggingFaceEmbeddings(
+            model_name=EMBEDDING_MODEL_NAME,
+            multi_process=True,
+            model_kwargs={"device": "cuda"},
+            encode_kwargs={"normalize_embeddings": True}, # set True for cosine similarity
+        )

+        self.vectorstore = FAISS.from_documents(
+            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
+        )
+
+        # Create LLM
+        READER_MODEL_NAME = path
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+        model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
+        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
+
+        # Testing
+        # tokenizer.pad_token = tokenizer.eos_token

+        READER_LLM = pipeline(
+            model=model,
+            tokenizer=tokenizer,
+            task="text-generation",
+            do_sample=True,
+            temperature=0.2,
+            repetition_penalty=1.1,
+            return_full_text=False,
+            max_new_tokens=256,
+        )

+        prompt_in_chat_format = [
+            {
+                "role": "system",
+                "content": """Using the information contained in the context.
+        Respond only to the question asked, response should be concise and relevant to the question.
+        If the answer cannot be deduced from the context, do not give an answer.""",
+            },
+            {
+                "role": "user",
+                "content": """Context: {context}
+        Now here is the question you need to answer.
+        Question: {question}""",
+            },
+        ]

+        self.RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
+            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
         )

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         # get inputs
         inputs = data.pop("inputs",data)
         date = data.pop("date", None)

+        retrieved_docs = self.vectorstore.similarity_search(query=inputs, k=2)

+        retrieved_docs_text = [
+            doc.page_content for doc in retrieved_docs
+        ] # we only need the text of the documents
+        context = "\nExtracted documents:\n"
+        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])
+
+        final_prompt = self.RAG_PROMPT_TEMPLATE.format(
+            question=inputs, context=context
+        )

+        # Redact an answer
+        answer = READER_LLM(final_prompt)[0]["generated_text"]
+
+        return answer
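A note on scope in the new version: `split_documents` is declared on the class but called as a bare function inside `__init__`, its default `tokenizer_name=EMBEDDING_MODEL_NAME` and its use of `MARKDOWN_SEPARATORS` refer to names that only come into existence later inside `__init__`, and `READER_LLM` is a local variable of `__init__` although `__call__` reads it. Below is a minimal sketch of one way the pieces would typically be arranged so the handler runs; this is an assumed fix-up, not part of the commit.

    # Assumed arrangement, abbreviated with "..." where the diff above already shows the body.
    EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"  # hoisted to module level
    MARKDOWN_SEPARATORS = ["\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n", "\n\n", "\n", " ", ""]

    class EndpointHandler:
        @staticmethod
        def split_documents(chunk_size: int, knowledge_base: list, tokenizer_name: str = EMBEDDING_MODEL_NAME) -> list:
            ...  # tokenizer-aware splitting and de-duplication, as in the diff above

        def __init__(self, path=""):
            ...  # build docs, vectorstore, model and tokenizer as in the diff above, then:
            # docs_processed = self.split_documents(512, docs, tokenizer_name=EMBEDDING_MODEL_NAME)
            # self.READER_LLM = pipeline(model=model, tokenizer=tokenizer, task="text-generation", ...)

        def __call__(self, data):
            ...  # retrieval and prompt formatting as in the diff above, then:
            # answer = self.READER_LLM(final_prompt)[0]["generated_text"]
            # return answer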
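For completeness, a custom handler.py for Hugging Face Inference Endpoints is normally smoke-tested by instantiating `EndpointHandler` with the model repository path and calling it with the same JSON body the endpoint would receive; this handler reads the question from the "inputs" key. A small sketch, where the path and the question are placeholders:

    # Local smoke test (sketch; assumes handler.py and the model weights live in `model_dir`,
    # and that the scope adjustments sketched above are in place).
    from handler import EndpointHandler

    model_dir = "."  # hypothetical path to the repository containing handler.py and the model
    handler = EndpointHandler(path=model_dir)

    payload = {"inputs": "Who is the current president of CityU?"}
    print(handler(payload))  # prints the generated answer string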