lchakkei committed
Commit 82a98ce
1 Parent(s): c3714b0

Update handler.py

Files changed (1)
  1. handler.py +29 -31
handler.py CHANGED
@@ -14,7 +14,6 @@ from langchain.memory import ConversationBufferMemory
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from llm_for_langchain import LLM
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
@@ -25,7 +24,6 @@ from operator import itemgetter
 from langchain.schema import format_document
 from langchain.memory import ConversationBufferMemory
 from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
-

 class EndpointHandler():
     def __init__(self, path=""):
@@ -37,32 +35,31 @@ class EndpointHandler():
         # Create LLM

         # load the tokenizer and the quantized mistral model
-        model = AutoModelForCausalLM.from_pretrained(
-            path,
-            device_map="auto")
-
-        tokenizer = AutoTokenizer.from_pretrained(path)
+        # chat = HuggingFacePipeline.from_model_id(
+        #     model_id=path,
+        #     task="text-generation",
+        #     device=0,
+        #     pipeline_kwargs={"max_new_tokens": 1024},
+        # )
+
+        model_id = path

-        # using HuggingFace's pipeline
-        pipeline = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            use_cache=True,
-            device_map="auto",
-            max_new_tokens=5000,
-            do_sample=True,
-            top_k=1,
-            temperature = 0.01,
-            num_return_sequences=1,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.eos_token_id,
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            padding_side="left",
+            add_eos_token=True,
+            use_fast=False
         )
-        chat = HuggingFacePipeline(pipeline=pipeline)
+        tokenizer.pad_token = tokenizer.eos_token
+
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
+        chat = HuggingFacePipeline(pipeline=pipe)

         # Create Text-Embedding Model
         embedding_function = HuggingFaceBgeEmbeddings(
-            model_name="DMetaSoul/Dmeta-embedding",
+            model_name="BAAI/bge-large-zh",
             model_kwargs={'device': 'cuda'},
             encode_kwargs={'normalize_embeddings': True}
         )
@@ -100,7 +97,7 @@ class EndpointHandler():

         Question: {question} [/INST]
         """
-        
+
         ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

         self.memory = ConversationBufferMemory(
@@ -119,7 +116,7 @@ class EndpointHandler():
                 "chat_history": lambda x: get_buffer_string(x["chat_history"]),
             }
             | CONDENSE_QUESTION_PROMPT
-            | chat(temperature=0)
+            | chat
             | StrOutputParser(),
         }

@@ -150,16 +147,17 @@ class EndpointHandler():
         self.final_chain = loaded_memory | standalone_question | retrieved_documents | answer

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        # pseudo
-        # self.model(input)
-        inputs = data.pop("inputs", data)
-        result = self.final_chain.invoke(inputs)
-        print(result['answer'])
+        # get inputs
+        inputs = data.pop("inputs",data)
+        date = data.pop("date", None)
+
+        result = self.final_chain.invoke({"question": inputs})

         # Note that the memory does not save automatically
         # This will be improved in the future
         # For now you need to save it yourself
+
         self.memory.save_context(inputs, {"answer": result["answer"].content})
         self.memory.load_memory_variables({})

-        return result
+        return result
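For reference, a minimal sketch of how the revised handler could be exercised after this commit. It assumes handler.py is importable, that a CUDA device is available for the BGE embeddings, and that the payload carries the question under the "inputs" key, which is how __call__ reads it above; the model path and the question text are placeholders, not part of the commit.

# Local smoke test for the updated EndpointHandler -- a sketch, not part of this commit.
from handler import EndpointHandler

# Placeholder model path; substitute the checkpoint the endpoint actually serves.
handler = EndpointHandler(path="mistralai/Mistral-7B-Instruct-v0.1")

# The Inference Endpoint runtime posts a JSON payload; __call__ pops "inputs"
# and wraps it as {"question": ...} before invoking the retrieval chain.
result = handler({"inputs": "Summarise the indexed documents in one paragraph."})

# The chain returns a dict whose "answer" field holds the generated reply.
print(result["answer"])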