Update app.py

app.py CHANGED
@@ -1,6 +1,4 @@
-
-# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
-# ruff: noqa: E501
+
 import os
 import platform
 import random
@@ -8,7 +6,6 @@ import time
 from dataclasses import asdict, dataclass
 from pathlib import Path
 
-# from types import SimpleNamespace
 import gradio as gr
 import psutil
 from about_time import about_time
@@ -16,22 +13,6 @@ from ctransformers import AutoModelForCausalLM
 from dl_hf_model import dl_hf_model
 from loguru import logger
 
-# filename_list = [
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_1.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_0.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_1.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q6_K.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q8_0.bin",
-# ]
 
 URL = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"  # 4.05G
 
@@ -44,10 +25,7 @@ _ = (
 )
 
 if _:
-    # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q2_K.bin"
     url = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"  # 2.87G
-    # url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin"  # 2.87G
-    # url = "https://huggingface.co/TheBloke/llama2_7b_chat_uncensored-GGML/blob/main/llama2_7b_chat_uncensored.ggmlv3.q4_K_M.bin"  # 4.08G
 
 
 prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -123,16 +101,14 @@ except Exception as exc_:
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
     model_type="llama",
-    # threads=cpu_count,
 )
 
 logger.info(f"done load llm {model_loc=} {file_size=}G")
 
 os.environ["TZ"] = "Asia/Shanghai"
 try:
-    time.tzset()
-except Exception:
-    # Windows
+    time.tzset()
+except Exception:
     logger.warning("Windows, cant run time.tzset()")
 
 _ = """
@@ -162,8 +138,7 @@ def generate(
     config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
-
-    # print(_)
+
 
     prompt = prompt_template.format(question=question)
 
@@ -177,16 +152,13 @@ logger.debug(f"{asdict(GenerationConfig())=}")
 
 
 def user(user_message, history):
-    # return user_message, history + [[user_message, None]]
     history.append([user_message, None])
-    return user_message, history
+    return user_message, history
 
 
 def user1(user_message, history):
-    # return user_message, history + [[user_message, None]]
     history.append([user_message, None])
-    return "", history
-
+    return "", history
 
 def bot_(history):
     user_message = history[-1][0]
@@ -208,7 +180,7 @@ def bot(history):
 
     logger.debug(f"{user_message=}")
 
-    with about_time() as atime:
+    with about_time() as atime:
         flag = 1
         prefix = ""
         then = time.time()
@@ -224,15 +196,14 @@ def bot(history):
                 print(prefix, end="", flush=True)
                 logger.debug(f"{prefix=}")
             print(elm, end="", flush=True)
-            # logger.debug(f"{elm}")
 
             response.append(elm)
             history[-1][1] = prefix + "".join(response)
             yield history
 
     _ = (
-        f"(time elapsed: {atime.duration_human}, "
-        f"{atime.duration/len(''.join(response)):.2f}s/char)"
+        f"(time elapsed: {atime.duration_human}, "
+        f"{atime.duration/len(''.join(response)):.2f}s/char)"
     )
 
     history[-1][1] = "".join(response) + f"\n{_}"
@@ -250,10 +221,8 @@ def predict_api(prompt):
             repetition_penalty=1.0,
             max_new_tokens=512,  # adjust as needed
             seed=42,
-            reset=True,
+            reset=True,
             stream=False,
-            # threads=cpu_count,
-            # stop=prompt_prefix[1:2],
         )
 
         response = generate(
@@ -265,9 +234,6 @@ def predict_api(prompt):
     except Exception as exc:
         logger.error(exc)
         response = f"{exc=}"
-    # bot = {"inputs": [response]}
-    # bot = [(prompt, response)]
-
     return response
 
 
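For reference, the code this commit touches boils down to: download a GGML file, load it with ctransformers, and stream tokens back to the chat. Below is a minimal sketch of that flow, not the full app.py. Two assumptions: dl_hf_model is assumed to return the local path and file size (the diff's logger.info line suggests this shape), and only the first line of prompt_template appears in the diff, so the Alpaca-style body here is a guess.

"""Minimal sketch of this app's load-and-stream flow (not the full app.py)."""
import time

from ctransformers import AutoModelForCausalLM
from dl_hf_model import dl_hf_model

URL = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"

# Assumed return shape (local path, size in GB), matching the diff's
# logger.info(f"done load llm {model_loc=} {file_size=}G") line.
model_loc, file_size = dl_hf_model(URL)

# model_type="llama" mirrors the from_pretrained call shown in the diff.
llm = AutoModelForCausalLM.from_pretrained(model_loc, model_type="llama")

# Only the first line of this template is visible in the diff; the
# instruction/response body below is an assumed Alpaca-style layout.
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction: {question}

### Response:
"""

prompt = prompt_template.format(question="What is the capital of France?")

# stream=True makes ctransformers yield text pieces one at a time,
# which is what bot() iterates over to update the Gradio history.
then = time.time()
pieces = []
for elm in llm(prompt, max_new_tokens=512, temperature=0.2, stream=True):
    print(elm, end="", flush=True)
    pieces.append(elm)
print(f"\n(time elapsed: {time.time() - then:.2f}s, {len(''.join(pieces))} chars)")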