ffreemt committed
Commit 0acdcc9
Parent: cb39675
Files changed (2):
  1. README.md +1 -1
  2. app.py +51 -44

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: llama2-7b-chat-uncensored-ggml
+title: langchain-llama2-7b-chat-uncensored-ggml
 emoji: 🚀
 colorFrom: green
 colorTo: green

app.py CHANGED
@@ -1,21 +1,34 @@
 """Run codes."""
 # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
 # ruff: noqa: E501
+import gc
 import os
 import platform
 import random
 import time
-from dataclasses import asdict, dataclass
 from pathlib import Path
+from collections import deque  # deque lives in collections, not queue
+from typing import Any, Dict, List, Union

 # from types import SimpleNamespace
 import gradio as gr
 import psutil
 from about_time import about_time
-from ctransformers import AutoModelForCausalLM
+from ctransformers import Config
 from dl_hf_model import dl_hf_model
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+# from ctransformers import AutoModelForCausalLM
+from langchain.llms import CTransformers
+from langchain.schema import LLMResult
 from loguru import logger

+deq = deque()
+sig_end = object()  # sentinel: signals that generation is done
+
+# from langchain.llms import OpenAI
+
 filename_list = [
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
@@ -121,6 +134,7 @@ cpu_count: int = int(_) if _ else 1
 logger.debug(f"{cpu_count=}")

 LLM = None
+gc.collect()

 try:
     model_loc, file_size = dl_hf_model(url)
@@ -128,10 +142,15 @@ except Exception as exc_:
     logger.error(exc_)
     raise SystemExit(1) from exc_

-LLM = AutoModelForCausalLM.from_pretrained(
-    model_loc,
+config = Config()
+config.stream = True  # enable token streaming
+
+# LLM = AutoModelForCausalLM.from_pretrained(
+LLM = CTransformers(
+    model=model_loc,
     model_type="llama",
+    threads=cpu_count,
+    callbacks=[StreamingStdOutCallbackHandler()],
 )

 logger.info(f"done load llm {model_loc=} {file_size=}G")
@@ -143,45 +162,33 @@ except Exception:
     # Windows
     logger.warning("Windows, cant run time.tzset()")

-_ = """
-ns = SimpleNamespace(
-    response="",
-    generator=(_ for _ in []),
-)
-# """

-@dataclass
-class GenerationConfig:
-    temperature: float = 0.7
-    top_k: int = 50
-    top_p: float = 0.9
-    repetition_penalty: float = 1.0
-    max_new_tokens: int = 512
-    seed: int = 42
-    reset: bool = False
-    stream: bool = True
-    # threads: int = cpu_count
-    # stop: list[str] = field(default_factory=lambda: [stop_string])
-
-
-def generate(
-    question: str,
-    llm=LLM,
-    config: GenerationConfig = GenerationConfig(),
-):
-    """Run model inference, will return a Generator if streaming is true."""
-    # _ = prompt_template.format(question=question)
-    # print(_)
-
-    prompt = prompt_template.format(question=question)
-
-    return llm(
-        prompt,
-        **asdict(config),
-    )
+class DequeCallbackHandler(BaseCallbackHandler):
+    """Mediate gradio and stream output."""
+
+    def __init__(self, deq: deque):
+        """Init deque for FIFO; may need to upgrade to queue.Queue or queue.SimpleQueue."""
+        self.q = deq
+
+    def on_llm_start(
+        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+    ) -> None:
+        """Run when LLM starts running. Clear the queue."""
+        self.q.clear()
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.q.append(token)

+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.q.append(sig_end)

-logger.debug(f"{asdict(GenerationConfig())=}")
+    def on_llm_error(
+        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+    ) -> None:
+        """Run when LLM errors."""
+        self.q.append(sig_end)  # deque has no .put(); append the sentinel


 def user(user_message, history):
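
The handler only writes into the deque; something on the gradio side has to drain it. A minimal consumer sketch using the class above; the fake producer thread and the 10 ms poll interval are illustrative assumptions, not from this commit (in app.py the real LLM call would drive the callbacks, e.g. `LLM(prompt, callbacks=[handler])`):

```python
import threading
import time

# deq, sig_end and DequeCallbackHandler come from the diff above
handler = DequeCallbackHandler(deq)

def stream_tokens():
    """Yield tokens from the deque until the end sentinel arrives."""
    while True:
        if deq:
            item = deq.popleft()
            if item is sig_end:
                return
            yield item
        else:
            time.sleep(0.01)  # cheap poll; avoids a busy-wait

def fake_llm():
    """Hypothetical producer standing in for the real LLM call."""
    handler.on_llm_start({}, ["hi"])
    for tok in ["Hello", ",", " ", "world"]:
        handler.on_llm_new_token(tok)
        time.sleep(0.05)
    handler.on_llm_end(None)  # the handler ignores the LLMResult

threading.Thread(target=fake_llm).start()
print("".join(stream_tokens()))  # -> Hello, world
```
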
@@ -223,7 +230,7 @@ def bot(history):

     logger.debug("about to generate")

-    config = GenerationConfig(reset=True)
+    config = Config(reset=True)
     for elm in generate(user_message, config=config):
         if flag == 1:
             logger.debug("in the loop")
@@ -251,7 +258,7 @@ def predict_api(prompt):
     logger.debug(f"{prompt=}")
     try:
         # user_prompt = prompt
-        config = GenerationConfig(
+        config = Config(
             temperature=0.2,
             top_k=10,
             top_p=0.9,
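
Both call sites now build a ctransformers Config instead of the deleted GenerationConfig dataclass. Config is itself a dataclass exposing the same sampling knobs, so the swap is mostly mechanical; a quick illustration, with values mirroring the diff:

```python
from dataclasses import asdict

from ctransformers import Config

config = Config(
    temperature=0.2,
    top_k=10,
    top_p=0.9,
)
print(asdict(config))  # Config is a dataclass, so asdict still works on it
```
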
@@ -467,4 +474,4 @@ else:
     concurrency_count = 1
 logger.info(f"{concurrency_count=}")

-block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
+# block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)