ML610 committed on
Commit
89c908e
1 Parent(s): 0cd8a49

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ from dataclasses import dataclass, asdict
5
+ from ctransformers import AutoModelForCausalLM, AutoConfig
6
+
7
+
8
@dataclass
class GenerationConfig:
    """Inference options forwarded verbatim to the ctransformers model call
    (``generate`` expands this with ``asdict(...)`` into keyword arguments).

    Field names must match the keyword arguments accepted by the
    ctransformers ``LLM.__call__`` API — TODO confirm against the installed
    ctransformers version.
    """

    temperature: float  # sampling temperature; lower = more deterministic
    top_k: int  # sample only from the k most-likely tokens
    top_p: float  # nucleus-sampling probability mass
    repetition_penalty: float  # >1.0 discourages repeated tokens
    max_new_tokens: int  # cap on generated tokens
    seed: int  # RNG seed for reproducible sampling
    reset: bool  # reset history/cache before generating (per setup comment below)
    stream: bool  # if True, the model call returns a per-token generator
    threads: int  # CPU threads used for inference
    stop: list[str]  # stop sequences that terminate generation
20
+
21
+
22
def format_prompt(user_prompt: str):
    """Wrap a raw user query in the Instruction/Response prompt template
    the CodeInstruct model was trained on."""
    template = "### Instruction:\n{question}\n\n### Response:"
    return template.format(question=user_prompt)
27
+
28
+
29
def generate(
    llm: AutoModelForCausalLM,
    generation_config: GenerationConfig,
    user_prompt: str,
):
    """Run model inference on *user_prompt*.

    Returns a generator of tokens when ``generation_config.stream`` is
    True, otherwise the complete response string.
    """
    prompt = format_prompt(user_prompt)
    options = asdict(generation_config)
    return llm(prompt, **options)
42
+
43
# ---- model setup -----------------------------------------------------------
config = AutoConfig.from_pretrained(
    "teknium/Replit-v2-CodeInstruct-3B", context_length=2048
)
llm = AutoModelForCausalLM.from_pretrained(
    os.path.abspath("replit-v2-codeinstruct-3b.q4_1.bin"),
    model_type="replit",
    config=config,
)

generation_config = GenerationConfig(
    temperature=0.2,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.0,
    max_new_tokens=512,  # adjust as needed
    seed=42,
    reset=True,  # reset history (cache)
    stream=True,  # streaming per word/token
    # BUG FIX: int(os.cpu_count() / 6) evaluates to 0 on machines with
    # fewer than 6 cores (and raises if cpu_count() returns None);
    # clamp to at least one thread.
    threads=max(1, (os.cpu_count() or 1) // 6),  # adjust for your CPU
    stop=["<|endoftext|>"],
)

user_prefix = "[user]: "
assistant_prefix = "[assistant]:"  # plain string; was an f-string with no placeholders

title = "Replit-v2-CodeInstruct-3b-ggml"
description = "This space is an attempt to run the 4 bit quantized version of 'Replit's CodeInstruct 3B' on CPU"

example_1 = "Write a python script for a function which calculates the factorial of the number inputted by user."
example_2 = "Write a python script which prints 'you are logged in' only if the user inputs a number between 1-10"

examples = [example_1, example_2]


def chat(user_prompt: str):
    """Gradio handler: stream the assistant's reply for one prompt.

    BUG FIX: ``gr.Interface`` calls its ``fn`` with only the textbox value,
    but ``generate`` takes (llm, generation_config, user_prompt) — wiring
    ``fn=generate`` directly raised a TypeError on every request.  This
    wrapper closes over the module-level model and config.  Because
    ``generation_config.stream`` is True, ``generate`` returns a token
    generator; yielding the growing response lets Gradio stream it to the
    output textbox.
    """
    response = ""
    for token in generate(llm, generation_config, user_prompt.strip()):
        response += token
        yield response


UI = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(label="user_prompt", placeholder="Ask your queries here...."),
    outputs=gr.Textbox(label="Assistant"),
    title=title,
    description=description,
    examples=examples,
)

UI.launch()