pvduy committed
Commit 54fe16b
Parent(s): 72609f1

init gradio app

Files changed (2)
  1. app.py +79 -0
  2. requirements.txt +6 -0
app.py ADDED
import argparse
from threading import Thread

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MAX_MAX_NEW_TOKENS = 4096  # token budget for the (possibly truncated) prompt
DEFAULT_MAX_NEW_TOKENS = 1024


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model", type=str)  # model path
    parser.add_argument("--n_gpus", type=int, default=1)  # number of GPUs
    return parser.parse_args()


def predict(message, history, system_prompt, temperature, max_tokens):
    global model, tokenizer
    # Serialize the system prompt, chat history, and new message in ChatML markup.
    instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
    for human, assistant in history:
        instruction += "<|im_start|>user\n" + human + "\n<|im_end|>\n"
        instruction += "<|im_start|>assistant\n" + assistant + "\n<|im_end|>\n"
    instruction += "<|im_start|>user\n" + message + "\n<|im_end|>\n<|im_start|>assistant\n"

    stop_tokens = ["<|endoftext|>", "<|im_end|>"]
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
    input_ids = enc.input_ids
    attention_mask = enc.attention_mask

    # Keep only the most recent tokens if the prompt exceeds the budget.
    if input_ids.shape[1] > MAX_MAX_NEW_TOKENS:
        input_ids = input_ids[:, -MAX_MAX_NEW_TOKENS:]
        attention_mask = attention_mask[:, -MAX_MAX_NEW_TOKENS:]

    input_ids = input_ids.cuda()
    attention_mask = attention_mask.cuda()
    generate_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        do_sample=True,
        top_p=0.95,
        temperature=temperature,
        max_new_tokens=max_tokens,
    )
    # Run generation on a background thread so tokens can be yielded as they stream in.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        if text in stop_tokens:
            break
        outputs.append(text)
        yield "".join(outputs)


if __name__ == "__main__":
    args = parse_args()
    tokenizer = AutoTokenizer.from_pretrained("stabilityai/stable-code-instruct-3b")
    model = AutoModelForCausalLM.from_pretrained("stabilityai/stable-code-instruct-3b")
    model = model.cuda()
    gr.ChatInterface(
        predict,
        title="Stable Code Instruct Chat - Demo",
        description="Chat Model Stable Code 3B",
        theme="soft",
        chatbot=gr.Chatbot(height=1400, label="Chat History"),
        textbox=gr.Textbox(placeholder="input", container=False, scale=7),
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
        additional_inputs=[
            gr.Textbox(
                "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
                label="System Prompt",
            ),
            gr.Slider(0, 1, 0.9, label="Temperature"),
            gr.Slider(100, 2048, 1024, label="Max Tokens"),
        ],
        additional_inputs_accordion_name="Parameters",
    ).queue().launch()
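
For reference, predict() flattens the whole conversation into ChatML-style markup before tokenizing it. The following is a minimal standalone sketch of the prompt it assembles; the system prompt, history, and message values are made-up examples:

# Minimal sketch of the ChatML-style prompt that predict() builds.
# The conversation contents below are hypothetical examples.
system_prompt = "A chat between a curious user and an artificial intelligence assistant."
history = [("Write hello world in Python.", 'print("hello world")')]
message = "Now do the same in Rust."

prompt = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
for human, assistant in history:
    prompt += "<|im_start|>user\n" + human + "\n<|im_end|>\n"
    prompt += "<|im_start|>assistant\n" + assistant + "\n<|im_end|>\n"
# The prompt ends with an open assistant turn for the model to complete;
# generation stops at <|im_end|> or <|endoftext|>.
prompt += "<|im_start|>user\n" + message + "\n<|im_end|>\n<|im_start|>assistant\n"
print(prompt)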
requirements.txt ADDED
gradio==3.50.2
gradio_client==0.6.1
transformers==4.38.2
tiktoken
torch
numpy