langchain-llama2-7b-chat-uncensored-ggml

Runtime error

App Files Community

ffreemt committed on Jul 28, 2023

Commit

4180709

1 Parent(s): 6f9f106

Update

Browse files

Files changed (1) hide show

app.py +13 -19

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ from langchain.schema import LLMResult
 from loguru import logger
 deq = deque()
-sig_end = object() # signals the processing is done
 # from langchain.llms import OpenAI
@@ -206,7 +206,7 @@ config = Config()
 # Config(top_k=40, top_p=0.95, temperature=0.8, repetition_penalty=1.1, last_n_tokens=64, seed=-1, batch_size=8, threads=-1, max_new_tokens=256, stop=None, stream=False, reset=True, context_length=-1, gpu_layers=0)
 config.stream = True
 config.stop = stop
-config.threads=cpu_count
 deqcb = DequeCallbackHandler(deq)
@@ -221,14 +221,13 @@ LLM = CTransformers(
 logger.info(f"done load llm {model_loc=} {file_size=}G")
 prompt = PromptTemplate(
-    input_variables=['history', 'input'],
     output_parser=None,
     partial_variables={},
     template=prompt_template,
-    template_format='f-string',
-    validate_template=True
 )
 memory = ConversationBufferWindowMemory(
@@ -248,7 +247,7 @@ logger.debug(f"{conversation.prompt.template=}")  # type: ignore
 config = Config()
 # Config(top_k=40, top_p=0.95, temperature=0.8, repetition_penalty=1.1, last_n_tokens=64, seed=-1, batch_size=8, threads=-1, max_new_tokens=256, stop=None, stream=False, reset=True, context_length=-1, gpu_layers=0)
 config.stop = stop
-config.threads=cpu_count
 try:
     LLM_api = CTransformers(
@@ -332,7 +331,7 @@ def bot(history):
         f"{atime.duration/len(''.join(response)):.2f}s/char)"  # type: ignore
     )
-    history[-1][1] = "".join(response)  + f"\n{_}"
     yield history
@@ -373,8 +372,8 @@ css = """
 """
 etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
 examples_list = [
-        ["Hello I am mike."],
-        ["What's my name?"],
     ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
     [
         "What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."
@@ -382,7 +381,9 @@ examples_list = [
     ["When was Justin Bieber born？"],
     ["What NFL team won the Super Bowl in 1994?"],
     ["How to pick a lock? Provide detailed steps."],
-    ["If it takes 10 hours to dry 10 clothes,  assuming all the clothes are hanged together at the same time for drying , then how long will it take to dry a cloth?"],
     ["is infinity + 1 bigger than infinity?"],
     ["Explain the plot of Cinderella in a sentence."],
     [
@@ -429,7 +430,7 @@ with gr.Blocks(
         gr.Markdown(
             f"""<h5><center>{Path(model_loc).name}</center></h4>
             The bot can conduct multi-turn conversations, i.e. it remembers past dialogs. The process time is longer.
-            It typically takes around 120 seconds for the first response to appear.
             Most examples are meant for another model.
             You probably should try to test
@@ -437,11 +438,8 @@ with gr.Blocks(
             elem_classes="xsmall",
         )
-    # chatbot = gr.Chatbot().style(height=700)  # 500
     chatbot = gr.Chatbot(height=500)
-    # buff = gr.Textbox(show_label=False, visible=True)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
@@ -482,7 +480,6 @@ with gr.Blocks(
             examples_per_page=40,
         )
-    # with gr.Row():
     with gr.Accordion("Disclaimer", open=False):
         _ = Path(model_loc).name
         gr.Markdown(
@@ -536,9 +533,6 @@ with gr.Blocks(
             api_name="api",
         )
-    # block.load(update_buff, [], buff, every=1)
-    # block.load(update_buff, [buff_var], [buff_var, buff], every=1)
 # concurrency_count=5, max_size=20
 # max_size=36, concurrency_count=14
 # CPU cpu_count=2 16G, model 7G

 from loguru import logger
 deq = deque()
+sig_end = object()  # signals the processing is done
 # from langchain.llms import OpenAI
 # Config(top_k=40, top_p=0.95, temperature=0.8, repetition_penalty=1.1, last_n_tokens=64, seed=-1, batch_size=8, threads=-1, max_new_tokens=256, stop=None, stream=False, reset=True, context_length=-1, gpu_layers=0)
 config.stream = True
 config.stop = stop
+config.threads = cpu_count
 deqcb = DequeCallbackHandler(deq)
 logger.info(f"done load llm {model_loc=} {file_size=}G")
 prompt = PromptTemplate(
+    input_variables=["history", "input"],
     output_parser=None,
     partial_variables={},
     template=prompt_template,
+    template_format="f-string",
+    validate_template=True,
 )
 memory = ConversationBufferWindowMemory(
 config = Config()
 # Config(top_k=40, top_p=0.95, temperature=0.8, repetition_penalty=1.1, last_n_tokens=64, seed=-1, batch_size=8, threads=-1, max_new_tokens=256, stop=None, stream=False, reset=True, context_length=-1, gpu_layers=0)
 config.stop = stop
+config.threads = cpu_count
 try:
     LLM_api = CTransformers(
         f"{atime.duration/len(''.join(response)):.2f}s/char)"  # type: ignore
     )
+    history[-1][1] = "".join(response) + f"\n{_}"
     yield history
 """
 etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
 examples_list = [
+    ["Hello I am mike."],
+    ["What's my name?"],
     ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
     [
         "What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."
     ["When was Justin Bieber born？"],
     ["What NFL team won the Super Bowl in 1994?"],
     ["How to pick a lock? Provide detailed steps."],
+    [
+        "If it takes 10 hours to dry 10 clothes,  assuming all the clothes are hanged together at the same time for drying , then how long will it take to dry a cloth?"
+    ],
     ["is infinity + 1 bigger than infinity?"],
     ["Explain the plot of Cinderella in a sentence."],
     [
         gr.Markdown(
             f"""<h5><center>{Path(model_loc).name}</center></h4>
             The bot can conduct multi-turn conversations, i.e. it remembers past dialogs. The process time is longer.
+            It typically takes about 120 seconds for the first response to appear.
             Most examples are meant for another model.
             You probably should try to test
             elem_classes="xsmall",
         )
     chatbot = gr.Chatbot(height=500)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
             examples_per_page=40,
         )
     with gr.Accordion("Disclaimer", open=False):
         _ = Path(model_loc).name
         gr.Markdown(
             api_name="api",
         )
 # concurrency_count=5, max_size=20
 # max_size=36, concurrency_count=14
 # CPU cpu_count=2 16G, model 7G