inflaton committed on
Commit 95d2e5f
1 Parent(s): 4f5127e

support concurrent access

app_modules/llm_chat_chain.py CHANGED
@@ -12,11 +12,11 @@ def get_llama_2_prompt_template():
     B_INST, E_INST = "[INST]", "[/INST]"
     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
 
-    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}\nAnswer in markdown:"
-    # system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. Read the chat history to get context"
-    system_prompt = """\
-    You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
-    """
+    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
+    system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. Read the chat history to get context"
+    # system_prompt = """\
+    # You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
+    # """
 
     SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
     prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
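For context, a minimal standalone sketch (not part of the commit) of how the pieces above assemble into a Llama-2 chat prompt: it reproduces the variables from the new version of the file and prints the resulting template string, with {chat_history} and {question} left as placeholders for the chain to fill in.

# Illustrative sketch of the prompt assembly in get_llama_2_prompt_template()
# after this commit; shown only to make the final template string explicit.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
system_prompt = (
    "You are a helpful assistant, you always only answer for the assistant "
    "then you stop. Read the chat history to get context"
)

SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST

# prompt_template is now:
# "[INST]<<SYS>>\n<system prompt>\n<</SYS>>\n\nChat History:\n\n{chat_history} \n\nUser: {question}[/INST]"
print(prompt_template)
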
app_modules/llm_inference.py CHANGED
@@ -46,7 +46,7 @@ class LLMInference(metaclass=abc.ABCMeta):
 
         chain = self.get_chain(tracing)
         result = (
-            self._run_qa_chain(
+            self._run_chain(
                 chain,
                 inputs,
                 streaming_handler,
@@ -68,12 +68,20 @@ class LLMInference(metaclass=abc.ABCMeta):
 
         return result
 
-    def _run_qa_chain(self, qa, inputs, streaming_handler):
+    def _execute_chain(self, chain, inputs, q, sh):
+        self.llm_loader.lock.acquire()
+        try:
+            q.put(chain(inputs, callbacks=[sh]))
+        finally:
+            # Release the lock
+            self.llm_loader.lock.release()
+
+    def _run_chain(self, chain, inputs, streaming_handler):
         que = Queue()
 
         t = Thread(
-            target=lambda qa, inputs, q, sh: q.put(qa(inputs, callbacks=[sh])),
-            args=(qa, inputs, que, streaming_handler),
+            target=self._execute_chain,
+            args=(chain, inputs, que, streaming_handler),
         )
         t.start()
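To make the concurrency change easier to follow, here is a small self-contained sketch (not from the repository) of the pattern _execute_chain/_run_chain implement after this commit: the chain runs on a worker thread while holding a shared lock, and its result comes back through a Queue. fake_chain is a hypothetical stand-in for a real LangChain chain, and the with-statement is equivalent to the explicit acquire()/release() pair in the diff.

import threading
from queue import Queue
from threading import Thread

lock = threading.Lock()  # plays the role of LLMLoader.lock in this sketch


def fake_chain(inputs, callbacks=None):
    # Hypothetical stand-in for chain(inputs, callbacks=[streaming_handler]).
    return {"text": "echo: " + inputs["question"]}


def execute_chain(chain, inputs, q):
    # Serialize access to the shared model so concurrent requests
    # do not interleave on a single llm/streamer instance.
    with lock:
        q.put(chain(inputs))


def run_chain(chain, inputs):
    que = Queue()
    t = Thread(target=execute_chain, args=(chain, inputs, que))
    t.start()
    # The real code streams tokens to the caller while the worker runs;
    # this sketch simply waits for the final result.
    result = que.get()
    t.join()
    return result


if __name__ == "__main__":
    print(run_chain(fake_chain, {"question": "hello"}))
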
app_modules/llm_loader.py CHANGED
@@ -1,21 +1,15 @@
 import os
 import sys
-import time
-import urllib
+import threading
 from queue import Queue
-from threading import Thread
 from typing import Any, Optional
 
 import torch
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.callbacks.tracers import LangChainTracer
-from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import GPT4All, HuggingFacePipeline, LlamaCpp
 from langchain.schema import LLMResult
-from langchain.vectorstores import VectorStore
-from langchain.vectorstores.base import VectorStore
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -89,6 +83,7 @@ class LLMLoader:
     llm: any
     streamer: any
     max_tokens_limit: int
+    lock: any
 
     def __init__(self, llm_model_type, lc_serve: bool = False):
         self.llm_model_type = llm_model_type
@@ -96,6 +91,7 @@ class LLMLoader:
         self.streamer = None if lc_serve else TextIteratorStreamer("")
         self.max_tokens_limit = 2048
         self.search_kwargs = {"k": 4}
+        self.lock = threading.Lock()
 
     def _init_streamer(self, tokenizer, custom_handler):
         self.streamer = (
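A usage note, not something this commit adds: since the new LLMLoader.lock is a standard threading.Lock, callers can also guard the model with a with-statement rather than explicit acquire()/release(). FakeLoader below is a hypothetical stand-in used only to keep the snippet self-contained.

import threading


class FakeLoader:
    """Hypothetical stand-in for LLMLoader, carrying only the lock attribute."""

    def __init__(self):
        self.lock = threading.Lock()


loader = FakeLoader()

# Equivalent to loader.lock.acquire() ... try/finally ... loader.lock.release():
with loader.lock:
    pass  # run the chain / call the shared model while holding the lock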