support concurrent access
- app_modules/llm_chat_chain.py  +5 -5
- app_modules/llm_inference.py  +12 -4
- app_modules/llm_loader.py  +3 -7
app_modules/llm_chat_chain.py
CHANGED

@@ -12,11 +12,11 @@ def get_llama_2_prompt_template():
     B_INST, E_INST = "[INST]", "[/INST]"
     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

-    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
-
-    system_prompt = """\
-    You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
-    """
+    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
+    system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. Read the chat history to get context"
+    # system_prompt = """\
+    # You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
+    # """

     SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
     prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
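For reference, here is what the concatenation above expands to once the [INST] / <<SYS>> markers are inlined. This is a minimal sketch, assuming get_llama_2_prompt_template() simply returns prompt_template (the return statement falls outside the hunk shown):

# Hypothetical expansion of prompt_template after the concatenation above.
# The {chat_history} and {question} placeholders are left for the chain's
# prompt template to substitute at run time.
prompt_template = (
    "[INST]<<SYS>>\n"
    "You are a helpful assistant, you always only answer for the assistant "
    "then you stop. Read the chat history to get context"
    "\n<</SYS>>\n\n"
    "Chat History:\n\n{chat_history} \n\nUser: {question}"
    "[/INST]"
)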
app_modules/llm_inference.py
CHANGED

@@ -46,7 +46,7 @@ class LLMInference(metaclass=abc.ABCMeta):

         chain = self.get_chain(tracing)
         result = (
-            self.
+            self._run_chain(
                 chain,
                 inputs,
                 streaming_handler,
@@ -68,12 +68,20 @@ class LLMInference(metaclass=abc.ABCMeta):

         return result

-    def
+    def _execute_chain(self, chain, inputs, q, sh):
+        self.llm_loader.lock.acquire()
+        try:
+            q.put(chain(inputs, callbacks=[sh]))
+        finally:
+            # Release the lock
+            self.llm_loader.lock.release()
+
+    def _run_chain(self, chain, inputs, streaming_handler):
         que = Queue()

         t = Thread(
-            target=
-            args=(
+            target=self._execute_chain,
+            args=(chain, inputs, que, streaming_handler),
         )
         t.start()

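Taken together, _run_chain hands the actual chain call off to _execute_chain on a worker thread: the loader's lock serializes access to the one shared model across concurrent requests, the streaming handler (passed as a callback) can emit tokens as generation proceeds, and the final result comes back through the Queue. Below is a self-contained sketch of the same lock + thread + queue pattern, with a dummy fake_chain standing in for the real LangChain chain; all names here are illustrative only.

import threading
import time
from queue import Queue
from threading import Thread

lock = threading.Lock()                  # plays the role of llm_loader.lock

def fake_chain(inputs):                  # plays the role of chain(inputs, callbacks=[sh])
    time.sleep(0.1)                      # pretend the model is generating
    return {"answer": "echo: " + inputs["question"]}

def execute_chain(inputs, q):
    with lock:                           # same effect as acquire() / try / finally: release()
        q.put(fake_chain(inputs))

def run_chain(inputs):
    q = Queue()
    t = Thread(target=execute_chain, args=(inputs, q))
    t.start()
    # a real caller would consume streamed tokens here while the thread runs
    t.join()
    return q.get()

if __name__ == "__main__":
    threads = [
        Thread(target=lambda i=i: print(run_chain({"question": f"q{i}"})))
        for i in range(3)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Using `with lock:` is functionally equivalent to the explicit acquire()/release() in _execute_chain, and it guarantees the lock is released even if the chain raises.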
app_modules/llm_loader.py
CHANGED

@@ -1,21 +1,15 @@
 import os
 import sys
-import
-import urllib
+import threading
 from queue import Queue
-from threading import Thread
 from typing import Any, Optional

 import torch
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.callbacks.tracers import LangChainTracer
-from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import GPT4All, HuggingFacePipeline, LlamaCpp
 from langchain.schema import LLMResult
-from langchain.vectorstores import VectorStore
-from langchain.vectorstores.base import VectorStore
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -89,6 +83,7 @@ class LLMLoader:
     llm: any
     streamer: any
     max_tokens_limit: int
+    lock: any

     def __init__(self, llm_model_type, lc_serve: bool = False):
         self.llm_model_type = llm_model_type
@@ -96,6 +91,7 @@ class LLMLoader:
         self.streamer = None if lc_serve else TextIteratorStreamer("")
         self.max_tokens_limit = 2048
         self.search_kwargs = {"k": 4}
+        self.lock = threading.Lock()

     def _init_streamer(self, tokenizer, custom_handler):
         self.streamer = (
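The lock lives on the loader rather than on each inference object, presumably because the inference objects share one LLMLoader (and hence one loaded model), so self.llm_loader.lock becomes the single gate in front of that model. A hypothetical stub, not the real classes, just to show that ownership relationship:

import threading

class LLMLoaderStub:                       # stand-in for LLMLoader
    def __init__(self):
        self.lock = threading.Lock()       # one lock per loaded model

class LLMInferenceStub:                    # stand-in for LLMInference
    def __init__(self, llm_loader):
        self.llm_loader = llm_loader       # many inference objects, one loader

    def generate(self, prompt):
        with self.llm_loader.lock:         # serialize access to the shared model
            return "response to: " + prompt

loader = LLMLoaderStub()
workers = [LLMInferenceStub(loader) for _ in range(2)]
print(workers[0].generate("hello"), workers[1].generate("world"))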