lchakkei
/

Mistral-7B-V2-Traditional-Chinese

@@ -29,16 +29,16 @@ class EndpointHandler():
     def __init__(self, path=""):
         # Config LangChain
-        # os.environ["LANGCHAIN_TRACING_V2"] = "true"
         # os.environ["LANGCHAIN_API_KEY"] =
         # Create LLM
-        model_id = path
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map={"": "cuda"},
-            torch_dtype=torch.bfloat16,
             load_in_8bit=True
         )
         model.eval()
@@ -66,15 +66,13 @@ class EndpointHandler():
         # Create Text-Embedding Model
         embedding_function = HuggingFaceBgeEmbeddings(
-            model_name="BAAI/bge-large-zh",
             model_kwargs={'device': 'cuda'},
             encode_kwargs={'normalize_embeddings': True}
         )
         # Load Vector db
         urls = [
-            "https://hk.on.cc/hk/bkn/cnt/news/20221019/bkn-20221019040039334-1019_00822_001.html",
-            "https://www.hk01.com/%E7%A4%BE%E6%9C%83%E6%96%B0%E8%81%9E/822848/%E5%89%B5%E7%A7%91%E7%B2%BE%E8%8B%B1-%E5%87%BA%E6%88%B02022%E4%B8%96%E7%95%8C%E6%8A%80%E8%83%BD%E5%A4%A7%E8%B3%BD%E7%89%B9%E5%88%A5%E8%B3%BD",
             "https://www.wenweipo.com/epaper/view/newsDetail/1582436861224292352.html",
             "https://www.thinkhk.com/article/2023-03/24/59874.html"
         ]
@@ -87,9 +85,11 @@ class EndpointHandler():
         vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
         retriever = vectorstore.as_retriever()
-        # compressor = LLMChainExtractor.from_llm(chat)
-        # retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
         _template = """[INST] Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
         Chat History:
@@ -160,11 +160,13 @@ class EndpointHandler():
         result = self.final_chain.invoke({"question": inputs})
         # Note that the memory is not saved automatically.
         # This will be improved in the future.
         # For now, you need to save it yourself.
-        self.memory.save_context(inputs, {"answer": result["answer"].content})
         self.memory.load_memory_variables({})
-        return result

     def __init__(self, path=""):
         # Config LangChain
+        os.environ["LANGCHAIN_TRACING_V2"] = "true"
         # os.environ["LANGCHAIN_API_KEY"] =
         # Create LLM
+        model_id = "mistralai/Mistral-7B-Instruct-v0.1"
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map={"": "cuda"},
+            torch_dtype=torch.float16,
             load_in_8bit=True
         )
         model.eval()
         # Create Text-Embedding Model
         embedding_function = HuggingFaceBgeEmbeddings(
+            model_name="DMetaSoul/Dmeta-embedding",
             model_kwargs={'device': 'cuda'},
             encode_kwargs={'normalize_embeddings': True}
         )
         # Load Vector db
         urls = [
             "https://www.wenweipo.com/epaper/view/newsDetail/1582436861224292352.html",
             "https://www.thinkhk.com/article/2023-03/24/59874.html"
         ]
         vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
         retriever = vectorstore.as_retriever()
+        compressor = LLMChainExtractor.from_llm(chat)
+        compression_retriever = ContextualCompressionRetriever(
+            base_compressor=compressor, base_retriever=retriever
+        )
         _template = """[INST] Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
         Chat History:
         result = self.final_chain.invoke({"question": inputs})
+        answer = result['answer']
         # Note that the memory is not saved automatically.
         # This will be improved in the future.
         # For now, you need to save it yourself.
+        # self.memory.save_context(inputs, {"answer": answer})
         self.memory.load_memory_variables({})
+        return answer