support concurrent access
- app_modules/llm_chat_chain.py  +5 -5
- app_modules/llm_inference.py  +12 -4
- app_modules/llm_loader.py  +3 -7
app_modules/llm_chat_chain.py
CHANGED

@@ -12,11 +12,11 @@ def get_llama_2_prompt_template():
     B_INST, E_INST = "[INST]", "[/INST]"
     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

-    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
-
-    system_prompt = """\
-    You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
-    """
+    instruction = "Chat History:\n\n{chat_history} \n\nUser: {question}"
+    system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. Read the chat history to get context"
+    # system_prompt = """\
+    # You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\nDo not output any emotional expression. Read the chat history to get context.\
+    # """

     SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
     prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
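For reference, here is what the concatenation above expands to once the [INST] / <<SYS>> markers are inlined. This is a minimal sketch, assuming get_llama_2_prompt_template() simply returns prompt_template (the return statement falls outside the hunk shown):

# Hypothetical expansion of prompt_template after the concatenation above.
# The {chat_history} and {question} placeholders are left for the chain's
# prompt template to substitute at run time.
prompt_template = (
    "[INST]<<SYS>>\n"
    "You are a helpful assistant, you always only answer for the assistant "
    "then you stop. Read the chat history to get context"
    "\n<</SYS>>\n\n"
    "Chat History:\n\n{chat_history} \n\nUser: {question}"
    "[/INST]"
)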
app_modules/llm_inference.py
CHANGED

@@ -46,7 +46,7 @@ class LLMInference(metaclass=abc.ABCMeta):

         chain = self.get_chain(tracing)
         result = (
-            self.
+            self._run_chain(
                 chain,
                 inputs,
                 streaming_handler,
@@ -68,12 +68,20 @@ class LLMInference(metaclass=abc.ABCMeta):

         return result

-    def
+    def _execute_chain(self, chain, inputs, q, sh):
+        self.llm_loader.lock.acquire()
+        try:
+            q.put(chain(inputs, callbacks=[sh]))
+        finally:
+            # Release the lock
+            self.llm_loader.lock.release()
+
+    def _run_chain(self, chain, inputs, streaming_handler):
         que = Queue()

         t = Thread(
-            target=
-            args=(
+            target=self._execute_chain,
+            args=(chain, inputs, que, streaming_handler),
         )
         t.start()

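Taken together, _run_chain hands the actual chain call off to _execute_chain on a worker thread: the loader's lock serializes access to the one shared model across concurrent requests, the streaming handler (passed as a callback) can emit tokens as generation proceeds, and the final result comes back through the Queue. Below is a self-contained sketch of the same lock + thread + queue pattern, with a dummy fake_chain standing in for the real LangChain chain; all names here are illustrative only.

import threading
import time
from queue import Queue
from threading import Thread

lock = threading.Lock()                  # plays the role of llm_loader.lock

def fake_chain(inputs):                  # plays the role of chain(inputs, callbacks=[sh])
    time.sleep(0.1)                      # pretend the model is generating
    return {"answer": "echo: " + inputs["question"]}

def execute_chain(inputs, q):
    with lock:                           # same effect as acquire() / try / finally: release()
        q.put(fake_chain(inputs))

def run_chain(inputs):
    q = Queue()
    t = Thread(target=execute_chain, args=(inputs, q))
    t.start()
    # a real caller would consume streamed tokens here while the thread runs
    t.join()
    return q.get()

if __name__ == "__main__":
    threads = [
        Thread(target=lambda i=i: print(run_chain({"question": f"q{i}"})))
        for i in range(3)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Using `with lock:` is functionally equivalent to the explicit acquire()/release() in _execute_chain, and it guarantees the lock is released even if the chain raises.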
app_modules/llm_loader.py
CHANGED

@@ -1,21 +1,15 @@
 import os
 import sys
-import
-import urllib
+import threading
 from queue import Queue
-from threading import Thread
 from typing import Any, Optional

 import torch
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.callbacks.tracers import LangChainTracer
-from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import GPT4All, HuggingFacePipeline, LlamaCpp
 from langchain.schema import LLMResult
-from langchain.vectorstores import VectorStore
-from langchain.vectorstores.base import VectorStore
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -89,6 +83,7 @@ class LLMLoader:
     llm: any
     streamer: any
     max_tokens_limit: int
+    lock: any

     def __init__(self, llm_model_type, lc_serve: bool = False):
         self.llm_model_type = llm_model_type
@@ -96,6 +91,7 @@ class LLMLoader:
         self.streamer = None if lc_serve else TextIteratorStreamer("")
         self.max_tokens_limit = 2048
         self.search_kwargs = {"k": 4}
+        self.lock = threading.Lock()

     def _init_streamer(self, tokenizer, custom_handler):
         self.streamer = (
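The lock lives on the loader rather than on each inference object, presumably because the inference objects share one LLMLoader (and hence one loaded model), so self.llm_loader.lock becomes the single gate in front of that model. A hypothetical stub, not the real classes, just to show that ownership relationship:

import threading

class LLMLoaderStub:                       # stand-in for LLMLoader
    def __init__(self):
        self.lock = threading.Lock()       # one lock per loaded model

class LLMInferenceStub:                    # stand-in for LLMInference
    def __init__(self, llm_loader):
        self.llm_loader = llm_loader       # many inference objects, one loader

    def generate(self, prompt):
        with self.llm_loader.lock:         # serialize access to the shared model
            return "response to: " + prompt

loader = LLMLoaderStub()
workers = [LLMInferenceStub(loader) for _ in range(2)]
print(workers[0].generate("hello"), workers[1].generate("world"))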