ffreemt committed • Commit 0acdcc9 • Parent(s): cb39675
Update
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: llama2-7b-chat-uncensored-ggml
+title: langchain-llama2-7b-chat-uncensored-ggml
 emoji: 🚀
 colorFrom: green
 colorTo: green
app.py CHANGED
@@ -1,21 +1,34 @@
 """Run codes."""
 # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
 # ruff: noqa: E501
+import gc
 import os
 import platform
 import random
 import time
-from dataclasses import asdict, dataclass
 from pathlib import Path
+from queue import deque
+from typing import Any, Dict, List, Union
 
 # from types import SimpleNamespace
 import gradio as gr
 import psutil
 from about_time import about_time
-from ctransformers import AutoModelForCausalLM
+from ctransformers import Config
 from dl_hf_model import dl_hf_model
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+# from ctransformers import AutoModelForCausalLM
+from langchain.llms import CTransformers
+from langchain.schema import LLMResult
 from loguru import logger
 
+deq = deque()
+sig_end = object()  # signals the processing is done
+
+# from langchain.llms import OpenAI
+
 filename_list = [
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
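The new module-level `deq` and `sig_end` objects set up the hand-off used later in the commit: a callback pushes tokens into the deque while a reader drains it until it sees the sentinel. A minimal, self-contained sketch of that pattern (the `fake_producer` and its token list are illustrative only, not part of app.py):

```python
from collections import deque  # app.py pulls the same class via `from queue import deque`
from threading import Thread
import time

deq = deque()
sig_end = object()  # a unique sentinel that no token string can ever equal

def fake_producer(tokens):
    """Stand-in for the LLM callback: push tokens, then the end-of-stream sentinel."""
    for tok in tokens:
        deq.append(tok)
        time.sleep(0.05)
    deq.append(sig_end)

Thread(target=fake_producer, args=(["Hello", ",", " world"],)).start()

text = ""
while True:
    if deq:
        item = deq.popleft()
        if item is sig_end:
            break
        text += item
    else:
        time.sleep(0.01)
print(text)  # -> Hello, world
```

Using a bare `object()` as the sentinel avoids any ambiguity with empty strings or None that a model might legitimately emit.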
@@ -121,6 +134,7 @@ cpu_count: int = int(_) if _ else 1
 logger.debug(f"{cpu_count=}")
 
 LLM = None
+gc.collect()
 
 try:
     model_loc, file_size = dl_hf_model(url)
@@ -128,10 +142,15 @@ except Exception as exc_:
     logger.error(exc_)
     raise SystemExit(1) from exc_
 
-LLM = AutoModelForCausalLM.from_pretrained(
+config = Config()
+config.stream = True
+
+# LLM = AutoModelForCausalLM.from_pretrained(
+LLM = CTransformers(
+    model=model_loc,
     model_type="llama",
+    threads=cpu_count,
+    callbacks=[StreamingStdOutCallbackHandler()],
 )
 
 logger.info(f"done load llm {model_loc=} {file_size=}G")
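For context, a hedged sketch of how the langchain `CTransformers` wrapper is typically used once a GGML file is on disk; the model path and prompt below are placeholders (in app.py the path comes from `dl_hf_model(url)`), and generation settings can equally be passed as a plain `config` dict:

```python
from langchain.llms import CTransformers
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm = CTransformers(
    model="Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",  # placeholder local path
    model_type="llama",
    config={"temperature": 0.2, "top_k": 10, "top_p": 0.9, "stream": True},
    callbacks=[StreamingStdOutCallbackHandler()],  # prints tokens as they arrive
)

print(llm("What is 1 + 1? Answer briefly."))
```

Passing `callbacks` at construction time is what lets the `StreamingStdOutCallbackHandler` (and, further down, the deque-based handler) observe each generated token.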
@@ -143,45 +162,33 @@ except Exception:
     # Windows
     logger.warning("Windows, cant run time.tzset()")
 
-_ = """
-ns = SimpleNamespace(
-    response="",
-    generator=(_ for _ in []),
-)
-# """
 
-    llm=LLM,
-    config: GenerationConfig = GenerationConfig(),
-):
-    """Run model inference, will return a Generator if streaming is true."""
-    # _ = prompt_template.format(question=question)
-    # print(_)
-
-    prompt = prompt_template.format(question=question)
-
-    return llm(
-        prompt,
-        **asdict(config),
-    )
+class DequeCallbackHandler(BaseCallbackHandler):
+    """Mediate gradio and stream output."""
+
+    def __init__(self, deq: deque):
+        """Init deque for FIFO, may need to upgrade to queue.Queue or queue.SimpleQueue."""
+        self.q = deq
+
+    def on_llm_start(
+        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+    ) -> None:
+        """Run when LLM starts running. Clean the queue."""
+        self.q.clear()
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.q.append(token)
 
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.q.append(sig_end)
 
+    def on_llm_error(
+        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+    ) -> None:
+        """Run when LLM errors."""
+        self.q.append(sig_end)
 
 
 def user(user_message, history):
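The handler above only fills the deque; the consumer is outside this hunk. A hedged sketch of how such a handler is usually drained from a generator (for example inside gradio's `bot` callback), assuming the LLM call runs in a background thread and that `DequeCallbackHandler(deq)` is attached to the model's callbacks; `stream_response` is an illustrative name, not a function in app.py:

```python
import time
from threading import Thread

def stream_response(prompt, llm, q, end):
    """Yield a growing reply while DequeCallbackHandler fills `q` from another thread."""
    # llm(prompt) blocks until generation finishes, so run it off to the side
    Thread(target=llm, args=(prompt,), daemon=True).start()

    response = ""
    while True:
        if q:
            item = q.popleft()
            if item is end:
                break
            response += item
            yield response  # a gr.Chatbot can render these growing partial strings
        else:
            time.sleep(0.01)
```

In app.py, `q` would be the module-level `deq` and `end` the `sig_end` sentinel defined next to the imports.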
@@ -223,7 +230,7 @@ def bot(history):
 
     logger.debug("about to generate")
 
-    config = GenerationConfig(reset=True)
+    config = Config(reset=True)
     for elm in generate(user_message, config=config):
         if flag == 1:
             logger.debug("in the loop")
@@ -251,7 +258,7 @@ def predict_api(prompt):
     logger.debug(f"{prompt=}")
     try:
         # user_prompt = prompt
-        config = GenerationConfig(
+        config = Config(
             temperature=0.2,
             top_k=10,
             top_p=0.9,
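Here `Config` is the ctransformers dataclass, so per-request sampling settings can be built much like the old `GenerationConfig` and, if needed, converted to a plain dict for the langchain wrapper. A small sketch under that assumption (how `generate()` actually forwards the settings is not visible in this hunk):

```python
from dataclasses import asdict
from ctransformers import Config

config = Config(temperature=0.2, top_k=10, top_p=0.9)
config.stream = True  # emit tokens one by one so callback handlers see them

# langchain's CTransformers accepts the same fields as a plain dict
config_dict = {k: v for k, v in asdict(config).items() if v is not None}
```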
@@ -467,4 +474,4 @@ else:
     concurrency_count = 1
 logger.info(f"{concurrency_count=}")
 
-block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
+# block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
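The queued launch is commented out here and its replacement falls outside the diff context. For reference, the usual gradio 3.x pattern for launching a streamed chatbot with request queuing looks roughly like this (the block contents are placeholders):

```python
import gradio as gr

with gr.Blocks() as block:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # event handlers (msg.submit -> user -> bot) are wired up elsewhere in app.py

block.queue(concurrency_count=1, max_size=5).launch(debug=True)
```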