Update handler.py
handler.py  +27 -11

handler.py CHANGED
@@ -34,17 +34,33 @@ class EndpointHandler():
 
         # Create LLM
         model_id = path
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map={"": "cuda"},
+            torch_dtype=torch.bfloat16,
+            load_in_8bit=True
+        )
+        model.eval()
 
+        # model_kwargs = {
+        #     "input_ids":input_ids,
+        #     "max_new_tokens":1024,
+        #     "do_sample":True,
+        #     "top_k":50,
+        #     "top_p":self.top_p,
+        #     "temperature":self.temperature,
+        #     "repetition_penalty":1.2,
+        #     "eos_token_id":self.tokenizer.eos_token_id,
+        #     "bos_token_id":self.tokenizer.bos_token_id,
+        #     "pad_token_id":self.tokenizer.pad_token_id
+        # }
+
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
-            trust_remote_code=True,
-            padding_side="left",
-            add_eos_token=True,
-            use_fast=False
         )
         tokenizer.pad_token = tokenizer.eos_token
-
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
         chat = HuggingFacePipeline(pipeline=pipe)
 
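This hunk moves quantization into `from_pretrained`: the checkpoint is now loaded in 8-bit via bitsandbytes instead of full precision, and the extra tokenizer kwargs are dropped. A minimal standalone sketch of the same loading pattern follows; the model id is an illustrative assumption (the handler uses the endpoint's `path`), and on recent transformers versions the 8-bit flag is passed through `BitsAndBytesConfig` rather than directly:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # illustrative; the handler uses `path`

# 8-bit weights require the bitsandbytes package and a CUDA device.
# device_map={"": "cuda"} places the whole model on the first GPU;
# torch_dtype sets the dtype of the modules left unquantized.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": "cuda"},
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
)
model.eval()  # inference only: disables dropout

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA-style tokenizers ship without a pad token

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
print(pipe("Hello, world.")[0]["generated_text"])
```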
@@ -70,10 +86,10 @@ class EndpointHandler():
         all_splits = text_splitter.split_documents(data)
 
         vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
-        retriever = vectorstore.as_retriever(
+        retriever = vectorstore.as_retriever()
 
-        compressor = LLMChainExtractor.from_llm(chat)
-        retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
+        # compressor = LLMChainExtractor.from_llm(chat)
+        # retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
 
         _template = """[INST] Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
 Chat History:
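This hunk trades context quality for speed: the LLM-based contextual compression step (one extra LLM call per retrieved chunk) is commented out in favor of the raw vector-store retriever. For reference, a sketch of both variants; the import paths assume the classic pre-0.1 `langchain` layout used in this handler, and `vectorstore` and `chat` stand for the objects built earlier in the handler:

```python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# What the handler now uses: plain top-k similarity search, no post-processing.
retriever = vectorstore.as_retriever()

# The disabled variant: pass every retrieved chunk through the LLM so only
# the passages relevant to the query survive. Tighter context for the final
# answer, at the cost of one extra generation per retrieved document.
compressor = LLMChainExtractor.from_llm(chat)
retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)
```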
@@ -148,7 +164,7 @@ class EndpointHandler():
         # This will be improved in the future
         # For now you need to save it yourself
 
-
-
+        self.memory.save_context(inputs, {"answer": result["answer"].content})
+        self.memory.load_memory_variables({})
 
         return result
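The two added lines persist the conversation turn by hand, matching the "save it yourself" comment above. A minimal sketch of the mechanism with LangChain's `ConversationBufferMemory`; the values are hypothetical stand-ins for the handler's chain inputs and outputs, and the `.content` access assumes the chain returns the answer as a message object rather than a plain string, as the diff implies:

```python
from langchain.memory import ConversationBufferMemory
from langchain.schema import AIMessage

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Stand-ins for the handler's chain inputs/outputs (hypothetical values).
inputs = {"question": "What is in these documents?"}
result = {"answer": AIMessage(content="They describe the deployment handler.")}

# Record the question/answer pair so the next call sees it as chat history.
# .content unwraps the message object into the string the memory stores.
memory.save_context(inputs, {"answer": result["answer"].content})

# Returns {"chat_history": [HumanMessage(...), AIMessage(...)]}
print(memory.load_memory_variables({}))
```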
|