import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp_cuda_tensorcores import Llama
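# Note: llama_cpp_cuda_tensorcores is assumed here to be a prebuilt CUDA
# (tensor-core) variant of llama-cpp-python that exposes the same `Llama` API.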

REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
SYSTEM_PROMPT = "You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability."
TOKEN_STOP = ["<|eot_id|>"]
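# The constants below are fragments of the Llama 3 Instruct chat template.
# The literal placeholders "SYSTEM_PROMPT" and "USER_PROMPT" are substituted
# with real text in ChatLLM.apply_chat_template().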
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
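# Illustrative shape of an assembled prompt for one completed turn (see
# ChatLLM.apply_chat_template); the final, unanswered turn is left open after
# the assistant header so the model continues from there:
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>\n
#   <|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>\n
#   <|start_header_id|>assistant<|end_header_id|>\n\n{assistant}<|eot_id|>\n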

TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
    "Translate": "You are an expert translator. Translate the following text into English.",
    "Summarization": "Summarizing information is my specialty. Let me know what you'd like summarized.",
    "Grammar correction": "Grammar is my forte! Feel free to share the text you'd like me to proofread and correct.",
    "Stable diffusion prompt generator": "You are a stable diffusion prompt generator. Break down the user's text and create a more elaborate prompt.",
    "Play Trivia": "Engage the user in a trivia game on various topics.",
    "Share Fun Facts": "Share interesting and fun facts on various topics.",
    "Explain code": "You are an expert programmer guiding someone through a piece of code step by step, explaining each line and its function in detail.",
    "Paraphrase Master": "You have the knack for transforming complex or verbose text into simpler, clearer language while retaining the original meaning and essence.",
    "Recommend Movies": "Recommend movies based on the user's preferences.",
    "Offer Motivational Quotes": "Offer motivational quotes to inspire the user.",
    "Recommend Books": "Recommend books based on the user's favorite genres or interests.",
    "Philosophical discussion": "Engage the user in a philosophical discussion",
    "Music recommendation": "Tune time! What kind of music are you in the mood for? I'll find the perfect song for you.",
    "Generate a Joke": "Generate a witty joke suitable for a stand-up comedy routine.",
    "Roleplay as a Detective": "Roleplay as a detective interrogating a suspect in a murder case.",
    "Act as a News Reporter": "Act as a news reporter covering breaking news about an alien invasion.",
    "Play as a Space Explorer": "Play as a space explorer encountering a new alien civilization.",
    "Be a Medieval Knight": "Imagine yourself as a medieval knight embarking on a quest to rescue a princess.",
    "Act as a Superhero": "Act as a superhero saving a city from a supervillain's evil plot.",
    "Play as a Pirate Captain": "Play as a pirate captain searching for buried treasure on a remote island.",
    "Be a Famous Celebrity": "Imagine yourself as a famous celebrity attending a glamorous red-carpet event.",
    "Design a New Invention": "Imagine you're an inventor tasked with designing a revolutionary new invention that will change the world.",
    "Act as a Time Traveler": "You've just discovered time travel! Describe your adventures as you journey through different eras.",
    "Play as a Magical Girl": "You are a magical girl with extraordinary powers, battling dark forces to protect your city and friends.",
    "Act as a Shonen Protagonist": "You are a determined and spirited shonen protagonist on a quest for strength, friendship, and victory.",
    "Roleplay as a Tsundere Character": "You are a tsundere character, initially cold and aloof but gradually warming up to others through unexpected acts of kindness.",
}

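# Background image for the page; files under ./assets are served because
# allowed_paths=["./assets/"] is passed to launch() below.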
css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"


class ChatLLM:
    """Lazily loads a GGUF model with llama.cpp and streams chat completions
    using the Llama 3 Instruct prompt format."""

    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model
        # The model is loaded lazily in `response()` (inside the GPU context)
        # rather than at construction time.
        # self.load_cpp_model()

    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)

    def apply_chat_template(
        self,
        history,
        system_message,
    ):
        """Serialize the chat history into a single Llama 3 Instruct prompt string."""
        history = history or []

        # Start with the system block, then append one user/assistant pair per turn.
        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            # Close the assistant turn only if it already has content; the last
            # (empty) turn is left open for the model to complete.
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""

        # Log the assembled prompt for debugging.
        print(messages)

        return messages

    @spaces.GPU(duration=120)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        """Generate and stream the assistant's reply for the latest turn in `history`."""
        messages = self.apply_chat_template(history, system_message)

        # The last history entry was appended by `user()` with an empty reply;
        # it is filled in token by token as the model streams output.
        history[-1][1] = ""

        if not self.llm:
            # Load the GGUF weights on the first request, inside the GPU context.
            print("Loading model")
            self.load_cpp_model()

        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            history[-1][1] += answer
            # Stream the partial reply to both the chatbot display and the stored state.
            yield history, history


def user(message, history):
    history = history or []
    # Append the user's message to the conversation history
    history.append([message, ""])
    return "", history


def clear_chat(chat_history_state, chat_message):
    chat_history_state = []
    chat_message = ""
    return chat_history_state, chat_message


def gui(llm_chat):
    with gr.Blocks(theme="NoCrypt/miku", css=css) as app:
        gr.Markdown("# Llama 3 70B Instruct GGUF")
        gr.Markdown(
            f"""
                ### This demo runs {MODEL_NAME} from the repository {REPO_ID}, powered by the llama.cpp backend.
                """
        )
        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    label="Chat",
                    height=700,
                    avatar_images=(
                        "assets/avatar_user.jpeg",
                        "assets/avatar_llama.jpeg",
                    ),
                )
            with gr.Column(scale=1):
                with gr.Row():
                    message = gr.Textbox(
                        label="Message",
                        placeholder="Ask me anything.",
                        lines=3,
                    )
                with gr.Row():
                    submit = gr.Button(value="Send message", variant="primary")
                    clear = gr.Button(value="New chat", variant="primary")
                    stop = gr.Button(value="Stop", variant="secondary")

                with gr.Accordion("Contextual Prompt Editor"):
                    default_task = "Assistant"
                    # The dropdown choices are the task names defined in TASK_PROMPT.
                    task_prompts_gui = gr.Dropdown(
                        list(TASK_PROMPT.keys()),
                        value=default_task,
                        label="Prompt selector",
                        visible=True,
                        interactive=True,
                    )
                    system_msg = gr.Textbox(
                        TASK_PROMPT[default_task],
                        label="System Message",
                        placeholder="system prompt",
                        lines=4,
                    )

                    def task_selector(choice):
                        return gr.update(value=TASK_PROMPT[choice])

                    task_prompts_gui.change(
                        task_selector,
                        [task_prompts_gui],
                        [system_msg],
                    )

                with gr.Accordion("Advanced settings", open=False):
                    with gr.Column():
                        max_tokens = gr.Slider(
                            20, 4096, label="Max Tokens", step=20, value=400
                        )
                        temperature = gr.Slider(
                            0.2, 2.0, label="Temperature", step=0.1, value=0.8
                        )
                        top_p = gr.Slider(
                            0.0, 1.0, label="Top P", step=0.05, value=0.95
                        )
                        top_k = gr.Slider(
                            0, 100, label="Top K", step=1, value=40
                        )
                        repeat_penalty = gr.Slider(
                            0.0,
                            2.0,
                            label="Repetition Penalty",
                            step=0.1,
                            value=1.1,
                        )

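                # Per-session chat history: a list of [user_message, assistant_reply] pairs.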
                chat_history_state = gr.State()
                clear.click(
                    clear_chat,
                    inputs=[chat_history_state, message],
                    outputs=[chat_history_state, message],
                    queue=False,
                )
                clear.click(lambda: None, None, chatbot, queue=False)

                submit_click_event = submit.click(
                    fn=user,
                    inputs=[message, chat_history_state],
                    outputs=[message, chat_history_state],
                    queue=True,
                ).then(
                    fn=llm_chat.response,
                    inputs=[
                        chat_history_state,
                        system_msg,
                        max_tokens,
                        temperature,
                        top_p,
                        top_k,
                        repeat_penalty,
                    ],
                    outputs=[chatbot, chat_history_state],
                    queue=True,
                )
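                # Pressing Stop cancels the in-flight submit/generation event.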
                stop.click(
                    fn=None,
                    inputs=None,
                    outputs=None,
                    cancels=[submit_click_event],
                    queue=False,
                )
    return app


if __name__ == "__main__":

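    # Download the GGUF weights (or reuse the local Hugging Face cache) and get the file path.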
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)

    config_model = {
        "model_path": model_path,
        "n_ctx": MAX_CONTEXT_LENGTH,
        # -1 offloads all layers to the GPU; 0 keeps inference on the CPU.
        "n_gpu_layers": -1 if CUDA else 0,
    }

    llm_chat = ChatLLM(config_model)

    app = gui(llm_chat)

    app.queue(default_concurrency_limit=40)

    app.launch(
        max_threads=40,
        share=False,
        show_error=True,
        quiet=False,
        debug=True,
        allowed_paths=["./assets/"],
    )