ffreemt committed
Commit 3cae1b6
1 Parent(s): f0fb4eb

Update buff enabled

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +39 -26
  3. run-app.sh +1 -0
.gitignore CHANGED
@@ -9,3 +9,4 @@ models
 .ruff_cache
 run-nodemon.sh
 app-.py
+nodemon.json
app.py CHANGED
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import os
 import time
-from dataclasses import asdict, dataclass, field
+from dataclasses import asdict, dataclass
 from pathlib import Path
 from types import SimpleNamespace

@@ -39,9 +39,9 @@ URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main
 
 url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
 url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin"  # 7.37G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin"  # 6.93G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin"  # 7.87G
 url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin"  # 7.37G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin"  # 6.93G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin"
 
 prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
@@ -50,9 +50,6 @@ prompt_template="""Below is an instruction that describes a task. Write a respon
 ### Response:
 """
 
-prompt_template_qa = """Question: {question}
-Answer: Let's work this out in a step by step way to be sure we have the right answer."""
-
 prompt_template = """System: You are a helpful,
 respectful and honest assistant. Always answer as
 helpfully as possible, while being safe. Your answers
@@ -67,9 +64,17 @@ information.
 User: {prompt}
 Assistant: """
 
-stop_string = [elm.split(":")[0] + ":" for elm in prompt_template.splitlines()][-2]
+prompt_template = """Question: {question}
+Answer: Let's work this out in a step by step way to be sure we have the right answer."""
 
-model_loc, file_size = dl_hf_model(url)
+_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
+stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
+
+try:
+    model_loc, file_size = dl_hf_model(url)
+except Exception as exc_:
+    logger.error(exc_)
+    raise SystemExit(1) from exc_
 
 logger.debug(f"{model_loc} {file_size}GB")
 
@@ -85,7 +90,7 @@ logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
 LLM = None
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
-    model_type="llama",  # "starcoder", AutoConfig.from_pretrained(REPO_ID)
+    model_type="llama",
     threads=cpu_count,
 )
 
@@ -100,7 +105,7 @@ except Exception:
 
 ns = SimpleNamespace(
     response="",
-    generator=[],
+    generator=(_ for _ in []),
 )
 
 
@@ -115,17 +120,17 @@ class GenerationConfig:
     reset: bool = False
     stream: bool = True
     threads: int = cpu_count
-    stop: list[str] = field(default_factory=lambda: [stop_string])
+    # stop: list[str] = field(default_factory=lambda: [stop_string])
 
 
 def generate(
-    prompt: str,
-    llm: AutoModelForCausalLM = LLM,
+    question: str,
+    llm=LLM,
     generation_config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
     # if not user_prompt.strip():
-    _ = prompt_template.format(prompt=prompt)
+    _ = prompt_template.format(question=question)
     print(_)
     return llm(
         _,
@@ -210,13 +215,13 @@ def predict(prompt, bot):
         for word in generator:
            # record first response time
            if flag:
-                logger.debug(f"\t {time.time() - then:.1f}s")
+                first_arr = f"{time.time() - then:.1f}s"
+                logger.debug(f"\t 1st arrival: {first_arr}")
                 flag = 0
-            # print(word, end="", flush=True)
-            print(word, flush=True)  # vertical stream
+            print(word, end="", flush=True)
+            # print(word, flush=True)  # vertical stream
             response += word
-            ns.response = response
-            buff.update(value=response)
+            ns.response = f"({first_arr}){response}"
         print("")
         logger.debug(f"{response=}")
     except Exception as exc:
@@ -229,7 +234,7 @@ def predict(prompt, bot):
         f"{atime.duration/(len(prompt) + len(response)):.1f}s/char)"  # type: ignore
     )
 
-    bot.append([prompt, f"{response} {_}"])
+    bot.append([prompt, f"{response} \n{_}"])
 
     return prompt, bot
 
@@ -247,9 +252,9 @@ def predict_api(prompt):
         max_new_tokens=512,  # adjust as needed
         seed=42,
         reset=False,  # reset history (cache)
-        stream=True,  # TODO stream=False and generator
+        stream=True,
         threads=cpu_count,
-        stop=prompt_prefix[1:2],
+        # stop=prompt_prefix[1:2],
     )
 
     generator = generate(
@@ -274,6 +279,10 @@ def predict_api(prompt):
     return response
 
 
+def update_buff():
+    return ns.response
+
+
 css = """
 .importantButton {
     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
@@ -320,8 +329,9 @@ examples = [
     ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
 ]
 
+logger.info("start block")
+
 with gr.Blocks(
-    # title="mpt-30b-chat-ggml",
     title=f"{Path(model_loc).name}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
@@ -343,7 +353,7 @@ with gr.Blocks(
 
     # chatbot = gr.Chatbot().style(height=700)  # 500
     chatbot = gr.Chatbot(height=500)
-    buff = gr.Textbox(show_label=False, visible=False)
+    buff = gr.Textbox(show_label=False, visible=True)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
@@ -359,12 +369,13 @@ with gr.Blocks(
     with gr.Row(visible=False):
         with gr.Accordion("Advanced Options:", open=False):
            with gr.Row():
-                with gr.Column(scale=2):
+                with gr.Column(scale=2, container=False):
                    system = gr.Textbox(
                        label="System Prompt",
                        value=prompt_template,
                        show_label=False,
-                    ).style(container=False)
+                        # ).style(container=False)
+                    )
                 with gr.Column():
                    with gr.Row():
                        change = gr.Button("Change System Prompt")
@@ -445,6 +456,8 @@ with gr.Blocks(
        api_name="api",
    )
 
+    block.load(update_buff, [], buff, every=1)
+
    # concurrency_count=5, max_size=20
    # max_size=36, concurrency_count=14
 block.queue(concurrency_count=5, max_size=20).launch(debug=True)
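Note on the reworked stop handling above: the commit swaps the Alpaca-style template for the Question/Answer template and derives stop_string from it. A quick trace, mirroring the added lines (the print is only illustrative), shows the expression resolves to "Question:", i.e. the prefix of the second-to-last prompt line; the corresponding stop field in GenerationConfig is nonetheless left commented out in this commit.

# Mirrors the stop_string derivation added in app.py; print added for illustration only.
prompt_template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""

_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
print(stop_string)  # -> Question: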
 
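The commit title refers to the buff textbox: predict() no longer calls buff.update() inside the streaming loop; it only writes the partial response into ns.response, and the now-visible buff is refreshed once per second via block.load(update_buff, [], buff, every=1). Below is a minimal, self-contained sketch of that polling pattern, assuming Gradio 3.x with every= support; fake_generate and the widget sizes are illustrative stand-ins, not part of the repo.

# Sketch of the buff polling pattern enabled by this commit (assumptions noted above).
import time
from types import SimpleNamespace

import gradio as gr

ns = SimpleNamespace(response="")  # shared buffer written by predict()


def fake_generate(prompt: str):
    """Stand-in for llm(..., stream=True): yield words with a delay."""
    for word in f"You said: {prompt}".split():
        time.sleep(0.3)
        yield word + " "


def predict(prompt, bot):
    bot = bot or []
    response = ""
    for word in fake_generate(prompt):
        response += word
        ns.response = response  # stream partial output into the shared buffer
    bot.append([prompt, response])
    return "", bot


def update_buff():
    # Polled once per second by block.load(..., every=1) below.
    return ns.response


with gr.Blocks() as block:
    chatbot = gr.Chatbot(height=300)
    buff = gr.Textbox(show_label=False, visible=True)
    msg = gr.Textbox(label="Prompt")
    msg.submit(predict, [msg, chatbot], [msg, chatbot])
    block.load(update_buff, [], buff, every=1)

block.queue().launch()

The every= polling only fires while the queue is enabled, which app.py already does with block.queue(concurrency_count=5, max_size=20).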
run-app.sh ADDED
@@ -0,0 +1 @@
+nodemon -w . -x python app.py