jackangel committed on
Commit b098a17
1 Parent(s): e95a04c

Update app.py

Files changed (1)
  1. app.py +31 -388
app.py CHANGED
@@ -1,398 +1,41 @@
- # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
- # ruff: noqa: E501
- import gc
- import os
- import platform
- import random
- import time
- from dataclasses import asdict, dataclass
- from pathlib import Path
- from typing import Optional, Sequence
-
- # from types import SimpleNamespace
  import gradio as gr
- import psutil
- from about_time import about_time
- from ctransformers import AutoModelForCausalLM
- from examples_list import examples_list
- from loguru import logger
-
- LLM = None
- gc.collect()
-
- # raise SystemExit(0)
-
- # Prompt template: Guanaco
- # {past_history}
- prompt_template = """You are a helpful assistant. Let's think step by step.
- ### Human:
- {question}
- ### Assistant:"""
-
- # Prompt template: garage-bAInd/Stable-Platypus2-13B
- prompt_template = """
- ### System:
- This is a system prompt, please behave and help the user.
-
- ### Instruction:
-
- {question}
-
- ### Response:
- """
- prompt_template = """
- [INST] Write code to solve the following coding problem that obeys the constraints and
- passes the example test cases. Please wrap your code answer using ```:

- {question}
- [/INST]
- """

- human_prefix = "### Instruction"
- ai_prefix = "### Response"
- stop_list = [f"{human_prefix}:"]

  _ = psutil.cpu_count(logical=False) - 1
  cpu_count: int = int(_) if _ else 1
  logger.debug(f"{cpu_count=}")

- LLM = AutoModelForCausalLM.from_pretrained(
-     "jackangel/LLama_3_Instruct_SPFx_Docs_Unsloth",
-     model_type="llama",
-     threads=cpu_count,
- )
-
- os.environ["TZ"] = "Asia/Shanghai"
- try:
-     time.tzset() # type: ignore # pylint: disable=no-member
- except Exception:
-     # Windows
-     logger.warning("Windows, cant run time.tzset()")
-
-
- # ctransformers.Config() default
- # Config(top_k=40, top_p=0.95, temperature=0.8,
- # repetition_penalty=1.1, last_n_tokens=64, seed=-1,
- # batch_size=8, threads=-1, max_new_tokens=256,
- # stop=None, stream=False, reset=True,
- # context_length=-1, gpu_layers=0)
- @dataclass
- class GenerationConfig:
-     temperature: float = 0.7
-     top_k: int = 50
-     top_p: float = 0.9
-     repetition_penalty: float = 1.0
-     max_new_tokens: int = 512
-     seed: int = 42
-     reset: bool = False
-     stream: bool = True
-     threads: int = cpu_count
-     # stop: list[str] = field(default_factory=lambda: stop_list)
-
- # ctransformers\llm.py
- @dataclass
- class Config:
-     # sample
-     top_k: int = 40
-     top_p: float = 0.95
-     temperature: float = 0.8
-     repetition_penalty: float = 1.1
-     last_n_tokens: int = 64
-     seed: int = -1
-
-     # eval
-     batch_size: int = 8
-     threads: int = -1
-
-     # generate
-     max_new_tokens: int = 512 # 256
-     stop: Optional[Sequence[str]] = None
-     stream: bool = True # False
-     reset: bool = False # True
-
-     # model
-     # context_length: int = -1
-     # gpu_layers: int = 0
-
-
- def generate(
-     question: str,
-     llm=LLM,
-     # config: GenerationConfig = GenerationConfig(),
-     config: Config = Config(),
- ):
-     """Run model inference, will return a Generator if streaming is true."""
-     # _ = prompt_template.format(question=question)
-     # print(_)
-
-     prompt = prompt_template.format(question=question)
-
-     return llm(
-         prompt,
-         **asdict(config),
-         # **vars(config),
-     )
-
-
- # logger.debug(f"{asdict(GenerationConfig())=}")
- logger.debug(f"{Config(stream=True)=}")
- logger.debug(f"{vars(Config(stream=True))=}")
-
-
- def user(user_message, history):
-     # return user_message, history + [[user_message, None]]
-     if history is None:
-         history = []
-     history.append([user_message, None])
-     return user_message, history # keep user_message
-
-
- def user1(user_message, history):
-     # return user_message, history + [[user_message, None]]
-     if history is None:
-         history = []
-     history.append([user_message, None])
-     return "", history # clear user_message
-
-
- def bot_(history):
-     user_message = history[-1][0]
-     resp = random.choice(["How are you?", "I love you", "I'm very hungry"])
-     bot_message = user_message + ": " + resp
-     history[-1][1] = ""
-     for character in bot_message:
-         history[-1][1] += character
-         time.sleep(0.02)
-         yield history
-
-     history[-1][1] = resp
-     yield history
-
-
- def bot(history):
-     user_message = ""
-     try:
-         user_message = history[-1][0]
-     except Exception as exc:
-         logger.error(exc)
-     response = []
-
-     logger.debug(f"{user_message=}")
-
-     with about_time() as atime: # type: ignore
-         flag = 1
-         prefix = ""
-         then = time.time()
-
-         logger.debug("about to generate")
-
-         config = GenerationConfig(reset=True)
-         for elm in generate(user_message, config=config):
-             if flag == 1:
-                 logger.debug("in the loop")
-                 prefix = f"({time.time() - then:.2f}s) "
-                 flag = 0
-                 print(prefix, end="", flush=True)
-                 logger.debug(f"{prefix=}")
-             print(elm, end="", flush=True)
-             # logger.debug(f"{elm}")
-
-             response.append(elm)
-             history[-1][1] = prefix + "".join(response)
-             yield history
-
-     _ = (
-         f"(time elapsed: {atime.duration_human}, " # type: ignore
-         f"{atime.duration/len(''.join(response)):.2f}s/char)" # type: ignore
-     )
-
-     history[-1][1] = "".join(response) + f"\n{_}"
-     yield history
-
-
- def predict_api(prompt):
-     logger.debug(f"{prompt=}")
-     try:
-         # user_prompt = prompt
-         config = GenerationConfig(
-             temperature=0.2,
-             top_k=10,
-             top_p=0.9,
-             repetition_penalty=1.0,
-             max_new_tokens=512, # adjust as needed
-             seed=42,
-             reset=True, # reset history (cache)
-             stream=False,
-             # threads=cpu_count,
-             # stop=prompt_prefix[1:2],
-         )
-
-         response = generate(
-             prompt,
-             config=config,
-         )
-
-         logger.debug(f"api: {response=}")
-     except Exception as exc:
-         logger.error(exc)
-         response = f"{exc=}"
-     # bot = {"inputs": [response]}
-     # bot = [(prompt, response)]
-
-     return response
-
-
- css = """
- .importantButton {
- background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
- border: none !important;
- }
- .importantButton:hover {
- background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
- border: none !important;
- }
- .disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
- .xsmall {font-size: x-small;}
- """
-
- logger.info("start block")
-
- with gr.Blocks(
-     title=f"{Path(model_loc).name}",
-     # theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
-     theme=gr.themes.Glass(text_size="sm", spacing_size="sm"),
-     css=css,
- ) as block:
-     # buff_var = gr.State("")
-     with gr.Accordion("🎈 Info", open=False):
-         gr.Markdown(
-             f"""<h5><center>{Path(model_loc).name}</center></h4>
- Doesn't quite work -- no output or run forever.""",
-             elem_classes="xsmall",
-         )
-
-     # chatbot = gr.Chatbot().style(height=700) # 500
-     chatbot = gr.Chatbot(height=500)
-
-     # buff = gr.Textbox(show_label=False, visible=True)
-
-     with gr.Row():
-         with gr.Column(scale=5):
-             msg = gr.Textbox(
-                 label="Chat Message Box",
-                 placeholder="Ask me anything (press Shift+Enter or click Submit to send)",
-                 show_label=False,
-                 # container=False,
-                 lines=6,
-                 max_lines=30,
-                 show_copy_button=True,
-                 # ).style(container=False)
-             )
-         with gr.Column(scale=1, min_width=50):
-             with gr.Row():
-                 submit = gr.Button("Submit", elem_classes="xsmall")
-                 stop = gr.Button("Stop", visible=True)
-                 clear = gr.Button("Clear History", visible=True)
-     with gr.Row(visible=False):
-         with gr.Accordion("Advanced Options:", open=False):
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     system = gr.Textbox(
-                         label="System Prompt",
-                         value=prompt_template,
-                         show_label=False,
-                         container=False,
-                         # ).style(container=False)
-                     )
-                 with gr.Column():
-                     with gr.Row():
-                         change = gr.Button("Change System Prompt")
-                         reset = gr.Button("Reset System Prompt")
-
-     with gr.Accordion("Example Inputs", open=True):
-         examples = gr.Examples(
-             examples=examples_list,
-             inputs=[msg],
-             examples_per_page=40,
-         )
-
-     # with gr.Row():
-     with gr.Accordion("Disclaimer", open=False):
-         _ = Path(model_loc).name
-         gr.Markdown(
-             f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
-             f"factually accurate information. {_} was trained on various public datasets; while great efforts "
-             "have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
-             "biased, or otherwise offensive outputs.",
-             elem_classes=["disclaimer"],
-         )
-
-     msg_submit_event = msg.submit(
-         # fn=conversation.user_turn,
-         fn=user,
-         inputs=[msg, chatbot],
-         outputs=[msg, chatbot],
-         queue=True,
-         show_progress="full",
-         # api_name=None,
-     ).then(bot, chatbot, chatbot, queue=True)
-     submit_click_event = submit.click(
-         # fn=lambda x, y: ("",) + user(x, y)[1:], # clear msg
-         fn=user1, # clear msg
-         inputs=[msg, chatbot],
-         outputs=[msg, chatbot],
-         queue=True,
-         # queue=False,
-         show_progress="full",
-         # api_name=None,
-     ).then(bot, chatbot, chatbot, queue=True)
-     stop.click(
-         fn=None,
-         inputs=None,
-         outputs=None,
-         cancels=[msg_submit_event, submit_click_event],
-         queue=False,
-     )
-     clear.click(lambda: None, None, chatbot, queue=False)
-
-     with gr.Accordion("For Chat/Translation API", open=False, visible=False):
-         input_text = gr.Text()
-         api_btn = gr.Button("Go", variant="primary")
-         out_text = gr.Text()
-
-     api_btn.click(
-         predict_api,
-         input_text,
-         out_text,
-         api_name="api",
-     )
-
- # block.load(update_buff, [], buff, every=1)
- # block.load(update_buff, [buff_var], [buff_var, buff], every=1)
-
- # concurrency_count=5, max_size=20
- # max_size=36, concurrency_count=14
- # CPU cpu_count=2 16G, model 7G
- # CPU UPGRADE cpu_count=8 32G, model 7G
-
- # does not work
- _ = """
- # _ = int(psutil.virtual_memory().total / 10**9 // file_size - 1)
- # concurrency_count = max(_, 1)
- if psutil.cpu_count(logical=False) >= 8:
- # concurrency_count = max(int(32 / file_size) - 1, 1)
- else:
- # concurrency_count = max(int(16 / file_size) - 1, 1)
- # """
-
- # default concurrency_count = 1
- # block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
-
- server_port = 7860
- if "forindo" in platform.node():
-     server_port = 7861
- block.queue(max_size=5).launch(
-     debug=True, server_name="0.0.0.0", server_port=server_port
  )

- # block.queue(max_size=5).launch(debug=True, server_name="0.0.0.0")
  import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama

+ ## Download the GGUF model
+ model_name = "jackangel/LLama_3_Instruct_SPFx_Docs_Unsloth"
+ model_file = "Llama_3_Instruct_SPFx_Docs_Unsloth.Q4_K_M.gguf" # this is the specific model file we'll use in this example. It's a 4-bit quant, but other levels of quantization are available in the model repo if preferred
+ model_path = hf_hub_download(model_name, filename=model_file)


  _ = psutil.cpu_count(logical=False) - 1
  cpu_count: int = int(_) if _ else 1
  logger.debug(f"{cpu_count=}")

+ ## Instantiate model from downloaded file
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=1024, # Context length to use
+     n_threads=cpu_count, # Number of CPU threads to use
+     n_gpu_layers=0 # Number of model layers to offload to GPU
  )

+ ## Generation kwargs
+ generation_kwargs = {
+     "max_tokens":1000,
+     "stop":["</s>"],
+     "echo":False, # Echo the prompt in the output
+     "top_k":1 # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
+ }
+
+
+ def chatbot(message, history):
+     prompt = "INSTRUCTION: You are a helpful assistant\nINPUT: " + message + "\nOUTPUT:"
+     airemember = ""
+     for human,assistant in history:
+
+         airemember += "USER: " + human + "\nASSISTANT:" + assistant+"\n\n"
+     sendtoai = airemember + prompt
+     result = llm(sendtoai, **generation_kwargs)
+     return result
+ app = gr.ChatInterface(chatbot)
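As committed, the new app.py keeps the psutil and logger lines from the old version without importing psutil or loguru, returns the raw completion dict from llama-cpp-python to gr.ChatInterface (which expects a string reply), and never calls launch(). The sketch below is not part of the commit: it reuses the model file, prompt layout, and generation settings from the diff, while the extra imports, the ["choices"][0]["text"] extraction, and the launch() call are editorial assumptions about what a runnable version would need.

# Editorial sketch, not part of the commit: a self-contained variant of the new app.py.
import gradio as gr
import psutil
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from loguru import logger

# Download the GGUF model file from the Hub (names taken from the diff)
model_name = "jackangel/LLama_3_Instruct_SPFx_Docs_Unsloth"
model_file = "Llama_3_Instruct_SPFx_Docs_Unsloth.Q4_K_M.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

# Leave one physical core free, as in the original app
_ = psutil.cpu_count(logical=False) - 1
cpu_count: int = int(_) if _ else 1
logger.debug(f"{cpu_count=}")

# Instantiate the model from the downloaded file (CPU only)
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=cpu_count,
    n_gpu_layers=0,
)

generation_kwargs = {
    "max_tokens": 1000,
    "stop": ["</s>"],
    "echo": False,  # do not repeat the prompt in the output
    "top_k": 1,     # greedy decoding, as in the commit; raise for sampling
}


def chatbot(message, history):
    # Same prompt layout as the commit: prior turns first, then the new instruction.
    prompt = "INSTRUCTION: You are a helpful assistant\nINPUT: " + message + "\nOUTPUT:"
    airemember = ""
    for human, assistant in history:
        airemember += "USER: " + human + "\nASSISTANT:" + assistant + "\n\n"
    sendtoai = airemember + prompt
    result = llm(sendtoai, **generation_kwargs)
    # llm() returns a completion dict; gr.ChatInterface expects the reply text.
    return result["choices"][0]["text"]


app = gr.ChatInterface(chatbot)

if __name__ == "__main__":
    app.launch()

With the default pair-based history format, gr.ChatInterface passes history as (user, assistant) tuples, which is what the loop assumes; apart from the imports, the extracted return text, and the explicit launch(), the sketch follows the diff unchanged.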