lchakkei committed
Commit d40c010
1 Parent(s): 0dc190f

Update handler.py

Files changed (1):
  1. handler.py +27 -11
handler.py CHANGED
@@ -34,17 +34,33 @@ class EndpointHandler():
 
         # Create LLM
         model_id = path
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map={"": "cuda"},
+            torch_dtype=torch.bfloat16,
+            load_in_8bit=True
+        )
+        model.eval()
 
+        # model_kwargs = {
+        #     "input_ids":input_ids,
+        #     "max_new_tokens":1024,
+        #     "do_sample":True,
+        #     "top_k":50,
+        #     "top_p":self.top_p,
+        #     "temperature":self.temperature,
+        #     "repetition_penalty":1.2,
+        #     "eos_token_id":self.tokenizer.eos_token_id,
+        #     "bos_token_id":self.tokenizer.bos_token_id,
+        #     "pad_token_id":self.tokenizer.pad_token_id
+        # }
+
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
-            trust_remote_code=True,
-            padding_side="left",
-            add_eos_token=True,
-            use_fast=False
         )
         tokenizer.pad_token = tokenizer.eos_token
-
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
         chat = HuggingFacePipeline(pipeline=pipe)
 
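Note: recent transformers releases deprecate passing load_in_8bit=True straight to from_pretrained in favor of a BitsAndBytesConfig. A minimal sketch of an equivalent load (assuming torch is imported at the top of the file and bitsandbytes is installed on the endpoint):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Equivalent 8-bit load via the quantization-config API; torch_dtype
    # applies to the modules that stay unquantized (e.g. layer norms).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map={"": "cuda"},
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )
    model.eval()  # inference only: disables dropout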
 
@@ -70,10 +86,10 @@ class EndpointHandler():
         all_splits = text_splitter.split_documents(data)
 
         vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding_function)
-        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+        retriever = vectorstore.as_retriever()
 
-        compressor = LLMChainExtractor.from_llm(chat)
-        retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
+        # compressor = LLMChainExtractor.from_llm(chat)
+        # retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
 
         _template = """[INST] Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
         Chat History:
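Note: LangChain's VectorStoreRetriever already defaults to k=4, so dropping search_kwargs={"k": 4} fetches the same number of chunks; the substantive change is turning off LLM-based contextual compression. If that stage is wanted again, a sketch of the wiring (the same classes the commented-out lines reference):

    from langchain.retrievers import ContextualCompressionRetriever
    from langchain.retrievers.document_compressors import LLMChainExtractor

    # Each retrieved chunk is run through the LLM, which extracts only
    # the passages relevant to the query before they reach the chain.
    compressor = LLMChainExtractor.from_llm(chat)
    retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=vectorstore.as_retriever(),
    )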
@@ -148,7 +164,7 @@ class EndpointHandler():
         # This will be improved in the future
         # For now you need to save it yourself
 
-        # self.memory.save_context(inputs, {"answer": result["answer"].content})
-        # self.memory.load_memory_variables({})
+        self.memory.save_context(inputs, {"answer": result["answer"].content})
+        self.memory.load_memory_variables({})
 
         return result
 
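Note: the memory bookkeeping is now live rather than commented out. One caveat: with HuggingFacePipeline (a plain LLM, not a chat model), result["answer"] is typically a str, in which case .content raises AttributeError. A defensive sketch, assuming self.memory follows the ConversationBufferMemory interface:

    # Handle both return shapes: chat models yield a message object
    # with .content, plain LLMs yield a bare string.
    answer = result["answer"]
    self.memory.save_context(
        inputs,
        {"answer": answer.content if hasattr(answer, "content") else answer},
    )
    self.memory.load_memory_variables({})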