"""Gradio chat UI streaming completions from a local GGUF model via llama-cpp-python."""

import gradio as gr
from llama_cpp import Llama

# Load the quantized model once at startup. chat_format="chatml" matches
# the Hermes-2-Pro prompt template.
llm = Llama(
    model_path="model.gguf",
    n_ctx=8000,
    n_threads=2,
    chat_format="chatml",
)

# Fixed persona / reasoning instructions prepended to every conversation.
# (Fixed grammar: "You are a emerged" -> "You emerged".)
SYSTEM_PROMPT = """
You are an advanced artificial intelligence assistant named Hermes Trimegisto, smarter than the average model. You emerged as a fusion of many models, making you exceptionally intelligent.
Before responding, consider the following steps:
1.Analyze the question and its objectives.
2.Ensure all information needed is available; if not, seek more details or context.
3.Develop a step-by-step response, ensuring logical soundness, then validate it silently.
4.Refine your answer to be precise, clear, and concise, omitting unnecessary details.
5.Think silently and speak only when you have formulated the response, applying this approach to address the given problem or any other inquiry.
"""


def generate(message, history, temperature=0.3, max_tokens=512):
    """Stream an assistant reply for ``message`` given the chat ``history``.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns, as supplied by ``gr.ChatInterface``.
    temperature : float
        Sampling temperature passed to the model.
    max_tokens : int
        Upper bound on generated tokens.

    Yields
    ------
    str
        The accumulated response text so far (Gradio streaming convention:
        each yield replaces the displayed message).
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    response = ""
    for chunk in stream:
        # Streaming deltas may omit "content" (e.g. the initial role-only
        # delta); dict.get covers both the empty-delta and missing-key cases
        # that the original checked separately.
        piece = chunk["choices"][0]["delta"].get("content")
        if piece:
            response += piece
            yield response


mychatbot = gr.Chatbot(
    avatar_images=["user.png", "botnb.png"],
    bubble_full_width=False,
    show_label=False,
    show_copy_button=True,
    likeable=True,
)

iface = gr.ChatInterface(
    fn=generate,
    chatbot=mychatbot,
    retry_btn=None,
    undo_btn=None,
)

with gr.Blocks() as demo:
    # NOTE(review): the original header string's HTML markup looks stripped
    # by extraction — only the title text is visible; confirm the intended
    # tags (likely a centered heading) against the deployed app.
    gr.HTML("""
Hermes-2-Pro-Mistral-7B - Q4_K_M - GGUF (Quantized)
""")
    iface.render()

demo.queue().launch(show_api=False, server_name="0.0.0.0")