Update handler.py
handler.py  +27 -11

handler.py CHANGED
@@ -34,17 +34,33 @@ class EndpointHandler():
 
         # Create LLM
         model_id = path
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map={"": "cuda"},
+            torch_dtype=torch.bfloat16,
+            load_in_8bit=True
+        )
+        model.eval()
 
+        # model_kwargs = {
+        #     "input_ids":input_ids,
+        #     "max_new_tokens":1024,
+        #     "do_sample":True,
+        #     "top_k":50,
+        #     "top_p":self.top_p,
+        #     "temperature":self.temperature,
+        #     "repetition_penalty":1.2,
+        #     "eos_token_id":self.tokenizer.eos_token_id,
+        #     "bos_token_id":self.tokenizer.bos_token_id,
+        #     "pad_token_id":self.tokenizer.pad_token_id
+        # }
+
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
-            trust_remote_code=True,
-            padding_side="left",
-            add_eos_token=True,
-            use_fast=False
         )
         tokenizer.pad_token = tokenizer.eos_token
-
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
         chat = HuggingFacePipeline(pipeline=pipe)
 
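This hunk moves quantization into `from_pretrained`: the checkpoint is now loaded in 8-bit via bitsandbytes instead of full precision, and the extra tokenizer kwargs are dropped. A minimal standalone sketch of the same loading pattern follows; the model id is an illustrative assumption (the handler uses the endpoint's `path`), and on recent transformers versions the 8-bit flag is passed through `BitsAndBytesConfig` rather than directly:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # illustrative; the handler uses `path`

# 8-bit weights require the bitsandbytes package and a CUDA device.
# device_map={"": "cuda"} places the whole model on the first GPU;
# torch_dtype sets the dtype of the modules left unquantized.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": "cuda"},
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
)
model.eval()  # inference only: disables dropout

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA-style tokenizers ship without a pad token

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
print(pipe("Hello, world.")[0]["generated_text"])
```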
@@ -70,10 +86,10 @@ class EndpointHandler():
         all_splits = text_splitter.split_documents(data)
 
         vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
-        retriever = vectorstore.as_retriever(
+        retriever = vectorstore.as_retriever()
 
-        compressor = LLMChainExtractor.from_llm(chat)
-        retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
+        # compressor = LLMChainExtractor.from_llm(chat)
+        # retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
 
         _template = """[INST] Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
 Chat History:
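This hunk trades context quality for speed: the LLM-based contextual compression step (one extra LLM call per retrieved chunk) is commented out in favor of the raw vector-store retriever. For reference, a sketch of both variants; the import paths assume the classic pre-0.1 `langchain` layout used in this handler, and `vectorstore` and `chat` stand for the objects built earlier in the handler:

```python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# What the handler now uses: plain top-k similarity search, no post-processing.
retriever = vectorstore.as_retriever()

# The disabled variant: pass every retrieved chunk through the LLM so only
# the passages relevant to the query survive. Tighter context for the final
# answer, at the cost of one extra generation per retrieved document.
compressor = LLMChainExtractor.from_llm(chat)
retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)
```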
@@ -148,7 +164,7 @@ class EndpointHandler():
         # This will be improved in the future
         # For now you need to save it yourself
 
-
-
+        self.memory.save_context(inputs, {"answer": result["answer"].content})
+        self.memory.load_memory_variables({})
 
         return result
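The two added lines persist the conversation turn by hand, matching the "save it yourself" comment above. A minimal sketch of the mechanism with LangChain's `ConversationBufferMemory`; the values are hypothetical stand-ins for the handler's chain inputs and outputs, and the `.content` access assumes the chain returns the answer as a message object rather than a plain string, as the diff implies:

```python
from langchain.memory import ConversationBufferMemory
from langchain.schema import AIMessage

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Stand-ins for the handler's chain inputs/outputs (hypothetical values).
inputs = {"question": "What is in these documents?"}
result = {"answer": AIMessage(content="They describe the deployment handler.")}

# Record the question/answer pair so the next call sees it as chat history.
# .content unwraps the message object into the string the memory stores.
memory.save_context(inputs, {"answer": result["answer"].content})

# Returns {"chat_history": [HumanMessage(...), AIMessage(...)]}
print(memory.load_memory_variables({}))
```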
|