Tomoniai committed
Commit
d79aeef
1 Parent(s): f22bf4d

Create app.py

Files changed (1)
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
+ import gradio as gr
+ import torch
+ from threading import Thread
+ from typing import Iterator
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+ MAX_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = 2048
+
+ base_model_name = "m-a-p/OpenCodeInterpreter-DS-1.3B"
+ model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map="cpu", low_cpu_mem_usage=True)
+
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+
+ def format_prompt(message, history):
+     # Build the chat-template message list: system prompt, prior turns, then the new user message.
+     system_prompt = "You are OpenCodeInterpreter, an expert programmer that helps to write code based on the user request, with concise explanations."
+     prompt = []
+     prompt.append({"role": "system", "content": system_prompt})
+     for user_prompt, bot_response in history:
+         prompt.extend([{"role": "user", "content": user_prompt}, {"role": "assistant", "content": bot_response}])
+     prompt.append({"role": "user", "content": message})
+     return prompt
+
+ def generate(prompt: str, history: list[tuple[str, str]], max_new_tokens: int = 1024, temperature: float = 0.3,
+              top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.0) -> Iterator[str]:
+
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+
+     formatted_prompt = format_prompt(prompt, history)
+
+     # Tokenize with the model's chat template and keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens.
+     input_ids = tokenizer.apply_chat_template(formatted_prompt, return_tensors="pt", add_generation_prompt=True)
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+     input_ids = input_ids.to(model.device)
+
+     # Run generation in a background thread and stream tokens as they arrive.
+     # do_sample=True so the temperature/top-p/top-k controls actually take effect.
+     streamer = TextIteratorStreamer(tokenizer, timeout=15.0, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = dict(input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_p=top_p, top_k=top_k,
+                              temperature=temperature, num_beams=1, repetition_penalty=repetition_penalty, eos_token_id=tokenizer.eos_token_id)
+
+     t = Thread(target=model.generate, kwargs=generation_kwargs)
+     t.start()
+
+     outputs = []
+     for chunk in streamer:
+         outputs.append(chunk)
+         yield "".join(outputs).replace("<|EOT|>", "")
+
+
+ mychatbot = gr.Chatbot(layout="bubble", avatar_images=["user.png", "botoci.png"], bubble_full_width=False, show_label=False, show_copy_button=True, likeable=True)
+
+ additional_inputs = [
+     gr.Slider(
+         label="Max new tokens",
+         minimum=1,
+         maximum=MAX_MAX_NEW_TOKENS,
+         step=1,
+         value=512,
+     ),
+     gr.Slider(
+         label="Temperature",
+         minimum=0,
+         maximum=1.0,
+         step=0.1,
+         value=0.3,
+     ),
+     gr.Slider(
+         label="Top-p",
+         minimum=0.05,
+         maximum=1.0,
+         step=0.05,
+         value=0.9,
+     ),
+     gr.Slider(
+         label="Top-k",
+         minimum=1,
+         maximum=1000,
+         step=1,
+         value=50,
+     ),
+     gr.Slider(
+         label="Repetition penalty",
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         value=1.0,
+     ),
+ ]
+
+ iface = gr.ChatInterface(fn=generate,
+                          chatbot=mychatbot,
+                          additional_inputs=additional_inputs,
+                          description="Running on CPU. Responses may be slow in a CPU-only environment. 🙏🏻",
+                          retry_btn=None,
+                          undo_btn=None,
+                          )
+
+ with gr.Blocks() as demo:
+     gr.HTML("<center><h1>Tomoniai's Chat with OpenCodeInterpreter</h1></center>")
+     iface.render()
+
+ demo.queue(max_size=10).launch(show_api=False)
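
A minimal sketch for trying this Space locally, assuming app.py sits next to the avatar images it references (user.png and botoci.png) and a Gradio 4.x install (the retry_btn, undo_btn, and likeable arguments used above were dropped in later Gradio releases):

    pip install "gradio<5" torch transformers
    python app.py

Gradio then serves the chat UI on http://localhost:7860 by default; the 1.3B model weights are downloaded from the Hub on first run.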