ffreemt committed on
Commit
83cfba4
1 Parent(s): 918c99d
Files changed (4)
  1. README.md +2 -2
  2. app.py +412 -0
  3. examples_list.py +12 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
- title: Codellama 13b Python Ggml
+ title: codellama 13b python ggml
  emoji: 🦀
  colorFrom: pink
  colorTo: yellow
  sdk: gradio
  sdk_version: 3.41.2
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  ---

app.py ADDED
@@ -0,0 +1,412 @@
+ """Gradio demo that runs CodeLlama-13B-Python (GGML) with ctransformers."""
+ # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
+ # ruff: noqa: E501
+ import gc
+ import os
+ import platform
+ import random
+ import time
+ from dataclasses import asdict, dataclass
+ from pathlib import Path
+ from typing import Optional, Sequence
+
+ # from types import SimpleNamespace
+ import gradio as gr
+ import psutil
+ from about_time import about_time
+ from ctransformers import AutoModelForCausalLM
+ from dl_hf_model import dl_hf_model
+ from examples_list import examples_list
+ from loguru import logger
+
+ url = "https://huggingface.co/TheBloke/CodeLlama-13B-Python-GGML/blob/main/codellama-13b-python.ggmlv3.Q4_K_M.bin"  # 7.87G
+
+ LLM = None
+ gc.collect()
+
+ try:
+     logger.debug(f" dl {url}")
+     model_loc, file_size = dl_hf_model(url)
+     logger.info(f"downloaded model {model_loc=} {file_size=}G")
+ except Exception as exc_:
+     logger.error(exc_)
+     raise SystemExit(1) from exc_
+
+ # raise SystemExit(0)
+
+ # Prompt template: Guanaco
+ # {past_history}
+ prompt_template = """You are a helpful assistant. Let's think step by step.
+ ### Human:
+ {question}
+ ### Assistant:"""
+
+ # Prompt template: garage-bAInd/Stable-Platypus2-13B
+ prompt_template = """
+ ### System:
+ This is a system prompt, please behave and help the user.
+
+ ### Instruction:
+
+ {question}
+
+ ### Response:
+ """
+ # Prompt template actually used (the last assignment wins): CodeLlama [INST] ... [/INST] style
+ prompt_template = """
+ [INST] Write code to solve the following coding problem that obeys the constraints and
+ passes the example test cases. Please wrap your code answer using ```.
+ {question}
+ [/INST]
+ """
+
+ human_prefix = "### Instruction"
+ ai_prefix = "### Response"
+ stop_list = [f"{human_prefix}:"]
+
+ _ = (psutil.cpu_count(logical=False) or 2) - 1  # cpu_count(logical=False) may return None
+ cpu_count: int = int(_) if _ else 1
+ logger.debug(f"{cpu_count=}")
+
+ logger.debug(f"{model_loc=}")
+ LLM = AutoModelForCausalLM.from_pretrained(
+     model_loc,
+     model_type="llama",
+     threads=cpu_count,
+ )
+
+ os.environ["TZ"] = "Asia/Shanghai"
+ try:
+     time.tzset()  # type: ignore # pylint: disable=no-member
+ except Exception:
+     # Windows
+     logger.warning("Windows, can't run time.tzset()")
+
+
+ # ctransformers.Config() default
+ # Config(top_k=40, top_p=0.95, temperature=0.8,
+ # repetition_penalty=1.1, last_n_tokens=64, seed=-1,
+ # batch_size=8, threads=-1, max_new_tokens=256,
+ # stop=None, stream=False, reset=True,
+ # context_length=-1, gpu_layers=0)
+ @dataclass
+ class GenerationConfig:
+     temperature: float = 0.7
+     top_k: int = 50
+     top_p: float = 0.9
+     repetition_penalty: float = 1.0
+     max_new_tokens: int = 512
+     seed: int = 42
+     reset: bool = False
+     stream: bool = True
+     threads: int = cpu_count
+     # stop: list[str] = field(default_factory=lambda: stop_list)
+
+ # ctransformers\llm.py
+ @dataclass
+ class Config:
+     # sample
+     top_k: int = 40
+     top_p: float = 0.95
+     temperature: float = 0.8
+     repetition_penalty: float = 1.1
+     last_n_tokens: int = 64
+     seed: int = -1
+
+     # eval
+     batch_size: int = 8
+     threads: int = -1
+
+     # generate
+     max_new_tokens: int = 512  # 256
+     stop: Optional[Sequence[str]] = None
+     stream: bool = True  # False
+     reset: bool = False  # True
+
+     # model
+     # context_length: int = -1
+     # gpu_layers: int = 0
+
+
+ def generate(
+     question: str,
+     llm=LLM,
+     # config: GenerationConfig = GenerationConfig(),
+     config: Config = Config(),
+ ):
+     """Run model inference; returns a generator when streaming is enabled."""
+     # _ = prompt_template.format(question=question)
+     # print(_)
+
+     prompt = prompt_template.format(question=question)
+
+     return llm(
+         prompt,
+         **asdict(config),
+         # **vars(config),
+     )
+
+
+ # logger.debug(f"{asdict(GenerationConfig())=}")
+ logger.debug(f"{Config(stream=True)=}")
+ logger.debug(f"{vars(Config(stream=True))=}")
+
+
+ def user(user_message, history):
+     # return user_message, history + [[user_message, None]]
+     if history is None:
+         history = []
+     history.append([user_message, None])
+     return user_message, history  # keep user_message
+
+
+ def user1(user_message, history):
+     # return user_message, history + [[user_message, None]]
+     if history is None:
+         history = []
+     history.append([user_message, None])
+     return "", history  # clear user_message
+
+
+ def bot_(history):
+     user_message = history[-1][0]
+     resp = random.choice(["How are you?", "I love you", "I'm very hungry"])
+     bot_message = user_message + ": " + resp
+     history[-1][1] = ""
+     for character in bot_message:
+         history[-1][1] += character
+         time.sleep(0.02)
+         yield history
+
+     history[-1][1] = resp
+     yield history
+
+
+ def bot(history):
+     user_message = ""
+     try:
+         user_message = history[-1][0]
+     except Exception as exc:
+         logger.error(exc)
+     response = []
+
+     logger.debug(f"{user_message=}")
+
+     with about_time() as atime:  # type: ignore
+         flag = 1
+         prefix = ""
+         then = time.time()
+
+         logger.debug("about to generate")
+
+         config = GenerationConfig(reset=True)
+         for elm in generate(user_message, config=config):
+             if flag == 1:
+                 logger.debug("in the loop")
+                 prefix = f"({time.time() - then:.2f}s) "
+                 flag = 0
+                 print(prefix, end="", flush=True)
+                 logger.debug(f"{prefix=}")
+             print(elm, end="", flush=True)
+             # logger.debug(f"{elm}")
+
+             response.append(elm)
+             history[-1][1] = prefix + "".join(response)
+             yield history
+
+     _ = (
+         f"(time elapsed: {atime.duration_human}, "  # type: ignore
+         f"{atime.duration/max(len(''.join(response)), 1):.2f}s/char)"  # type: ignore  # guard against empty response
+     )
+
+     history[-1][1] = "".join(response) + f"\n{_}"
+     yield history
+
+
+ def predict_api(prompt):
+     logger.debug(f"{prompt=}")
+     try:
+         # user_prompt = prompt
+         config = GenerationConfig(
+             temperature=0.2,
+             top_k=10,
+             top_p=0.9,
+             repetition_penalty=1.0,
+             max_new_tokens=512,  # adjust as needed
+             seed=42,
+             reset=True,  # reset history (cache)
+             stream=False,
+             # threads=cpu_count,
+             # stop=prompt_prefix[1:2],
+         )
+
+         response = generate(
+             prompt,
+             config=config,
+         )
+
+         logger.debug(f"api: {response=}")
+     except Exception as exc:
+         logger.error(exc)
+         response = f"{exc=}"
+     # bot = {"inputs": [response]}
+     # bot = [(prompt, response)]
+
+     return response
+
+
+ css = """
+     .importantButton {
+         background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
+         border: none !important;
+     }
+     .importantButton:hover {
+         background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
+         border: none !important;
+     }
+     .disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
+     .xsmall {font-size: x-small;}
+ """
+
+ logger.info("start block")
+
+ with gr.Blocks(
+     title=f"{Path(model_loc).name}",
+     # theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
+     theme=gr.themes.Glass(text_size="sm", spacing_size="sm"),
+     css=css,
+ ) as block:
+     # buff_var = gr.State("")
+     with gr.Accordion("🎈 Info", open=False):
+         gr.Markdown(
+             f"""<h5><center>{Path(model_loc).name}</center></h5>
+             Most examples were written for other models;
+             you may need to adapt the prompts
+             to suit this one.""",
+             elem_classes="xsmall",
+         )
+
+     # chatbot = gr.Chatbot().style(height=700)  # 500
+     chatbot = gr.Chatbot(height=500)
+
+     # buff = gr.Textbox(show_label=False, visible=True)
+
+     with gr.Row():
+         with gr.Column(scale=5):
+             msg = gr.Textbox(
+                 label="Chat Message Box",
+                 placeholder="Ask me anything (press Shift+Enter or click Submit to send)",
+                 show_label=False,
+                 # container=False,
+                 lines=6,
+                 max_lines=30,
+                 show_copy_button=True,
+                 # ).style(container=False)
+             )
+         with gr.Column(scale=1, min_width=50):
+             with gr.Row():
+                 submit = gr.Button("Submit", elem_classes="xsmall")
+                 stop = gr.Button("Stop", visible=True)
+                 clear = gr.Button("Clear History", visible=True)
+     with gr.Row(visible=False):
+         with gr.Accordion("Advanced Options:", open=False):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     system = gr.Textbox(
+                         label="System Prompt",
+                         value=prompt_template,
+                         show_label=False,
+                         container=False,
+                         # ).style(container=False)
+                     )
+                 with gr.Column():
+                     with gr.Row():
+                         change = gr.Button("Change System Prompt")
+                         reset = gr.Button("Reset System Prompt")
+
+     with gr.Accordion("Example Inputs", open=True):
+         examples = gr.Examples(
+             examples=examples_list,
+             inputs=[msg],
+             examples_per_page=40,
+         )
+
+     # with gr.Row():
+     with gr.Accordion("Disclaimer", open=False):
+         _ = Path(model_loc).name
+         gr.Markdown(
+             f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
+             f"factually accurate information. {_} was trained on various public datasets; while great efforts "
+             "have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
+             "biased, or otherwise offensive outputs.",
+             elem_classes=["disclaimer"],
+         )
+
+     msg_submit_event = msg.submit(
+         # fn=conversation.user_turn,
+         fn=user,
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+         queue=True,
+         show_progress="full",
+         # api_name=None,
+     ).then(bot, chatbot, chatbot, queue=True)
+     submit_click_event = submit.click(
+         # fn=lambda x, y: ("",) + user(x, y)[1:],  # clear msg
+         fn=user1,  # clear msg
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+         queue=True,
+         # queue=False,
+         show_progress="full",
+         # api_name=None,
+     ).then(bot, chatbot, chatbot, queue=True)
+     stop.click(
+         fn=None,
+         inputs=None,
+         outputs=None,
+         cancels=[msg_submit_event, submit_click_event],
+         queue=False,
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+     with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+         input_text = gr.Text()
+         api_btn = gr.Button("Go", variant="primary")
+         out_text = gr.Text()
+
+     api_btn.click(
+         predict_api,
+         input_text,
+         out_text,
+         api_name="api",
+     )
+
+     # block.load(update_buff, [], buff, every=1)
+     # block.load(update_buff, [buff_var], [buff_var, buff], every=1)
+
+ # concurrency_count=5, max_size=20
+ # max_size=36, concurrency_count=14
+ # CPU cpu_count=2 16G, model 7G
+ # CPU UPGRADE cpu_count=8 32G, model 7G
+
+ # does not work
+ _ = """
+ # _ = int(psutil.virtual_memory().total / 10**9 // file_size - 1)
+ # concurrency_count = max(_, 1)
+ if psutil.cpu_count(logical=False) >= 8:
+     # concurrency_count = max(int(32 / file_size) - 1, 1)
+ else:
+     # concurrency_count = max(int(16 / file_size) - 1, 1)
+ # """
+
+ # default concurrency_count = 1
+ # block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
+
+ server_port = 7860
+ if "forindo" in platform.node():
+     server_port = 7861
+ block.queue(max_size=5).launch(
+     debug=True, server_name="0.0.0.0", server_port=server_port
+ )
+
+ # block.queue(max_size=5).launch(debug=True, server_name="0.0.0.0")
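Since app.py registers `predict_api` under `api_name="api"`, the Space can also be queried programmatically. A minimal sketch using `gradio_client`; the Space id below is a placeholder, not something defined in this commit:

```python
# Sketch: call the /api endpoint exposed by app.py via gradio_client.
# "<user>/<space-name>" is a placeholder -- replace it with the actual Space id.
from gradio_client import Client

client = Client("<user>/<space-name>")
result = client.predict(
    "Python Program for Bubble Sort",  # prompt forwarded to predict_api
    api_name="/api",
)
print(result)
```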
examples_list.py ADDED
@@ -0,0 +1,12 @@
+ """Set up examples."""
+ # pylint: disable=invalid-name, line-too-long
+ examples_list = [
+     ["Python Program for Bubble Sort"],
+     ["Bubble Sort"],
+     ["Python Program to Print the Fibonacci sequence"],
+     ["""Convert js code "const numbers = [1, 2, 3, 4, 5]; console.log(numbers.includes(4));" to python code."""],
+     ["Print the Fibonacci sequence"],
+     ["给出判断一个数是不是质数的 python 码。"],  # "Give Python code that checks whether a number is prime."
+     ["给出实现python 里 range(10)的 javascript 码。"],  # "Give JavaScript code that implements Python's range(10)."
+     ["给出实现python 里 [*range(10)]的 javascript 码。"],  # "Give JavaScript code that implements Python's [*range(10)]."
+ ]
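Each entry is a one-element list because app.py wires `gr.Examples` to a single input component (`inputs=[msg]`): clicking an example fills the message box with that string. A small standalone sketch of the same wiring, not part of the commit:

```python
# Sketch: how examples_list is consumed (one value per component listed in `inputs`).
import gradio as gr

from examples_list import examples_list

with gr.Blocks() as demo:
    msg = gr.Textbox(label="Chat Message Box")
    gr.Examples(examples=examples_list, inputs=[msg], examples_per_page=40)

demo.launch()
```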
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ ctransformers  # ==0.2.10 0.2.13
+ transformers  # ==4.30.2
+ # huggingface_hub
+ gradio
+ loguru
+ about-time
+ psutil
+ dl-hf-model
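With these requirements installed, the same GGML checkpoint can be loaded directly with `ctransformers`, mirroring what app.py does once `dl_hf_model` has resolved the file. A sketch; the local path is an assumption about where the file was downloaded:

```python
# Sketch: load the GGML file directly, as app.py does via AutoModelForCausalLM.
from ctransformers import AutoModelForCausalLM

model_path = "codellama-13b-python.ggmlv3.Q4_K_M.bin"  # assumed local path
llm = AutoModelForCausalLM.from_pretrained(model_path, model_type="llama")
print(llm("# a bubble sort in Python\n", max_new_tokens=128))
```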