# Sqlcoder-7b - Q5_K_M - TheBloke (quantization) & Kukedlc (deploy)

import gradio as gr
from llama_cpp import Llama

# Single shared model instance, loaded once at import time and used by generate().
llm = Llama(
    model_path="model.gguf",
    n_ctx=8000,  # context size handed to llama.cpp
    n_threads=2,
    chat_format="chatml",  # template applied to role/content messages
)
  
def generate(message, history, temperature=0.3, max_tokens=512):
    """Stream an SQL-assistant reply to ``message`` given the earlier turns.

    The transcript sent to the module-level ``llm`` is: the fixed system
    prompt, every past user/assistant pair from ``history`` in order, and
    finally the new ``message``. The completion is requested in streaming
    mode.

    Args:
        message: The new user message.
        history: Iterable of ``(user_text, assistant_text)`` pairs.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Completion token limit forwarded to the model.

    Yields:
        The reply text accumulated so far, once per streamed chunk
        (including chunks that carry no new text).
    """
    # Adjacent literals concatenate to exactly the original one-line prompt.
    system_prompt = (
        "You are a super Inteligent AI assistant in SQL ANSI coding."
        "You will give SQL ANSI queries. "
        "I want you to think step by step through each query without making any syntax errors. "
        "Always use the tables and columns provided to you in the conversation. "
        "Never invent tables or columns that do not exist. "
        "Be very smart in solving SQL queries, think deductively first, and inductively afterwards. "
        "And always think step by step. "
        "The answers must be precise, clear, and without talking more than what is asked for. "
        "Concise answers from an SQL expert. "
        "The query and a very brief description"
    )

    messages = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history:
        messages.extend(
            (
                {"role": "user", "content": user_turn},
                {"role": "assistant", "content": assistant_turn},
            )
        )
    messages.append({"role": "user", "content": message})

    completion_stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    partial_reply = ""
    for event in completion_stream:
        delta = event['choices'][0]["delta"]
        # An empty delta has no "content" key, so one membership test
        # covers both the emptiness check and the key check.
        if "content" in delta:
            partial_reply += delta["content"]
        # Emit the running text on every chunk, even when nothing was added.
        yield partial_reply

# Chat display widget, configured separately so it can be handed to ChatInterface.
mychatbot = gr.Chatbot(
    avatar_images=["user.png", "botnb.png"],
    bubble_full_width=False,
    show_label=False,
    show_copy_button=True,
    likeable=True,
)

# Chat interface driven by generate(); retry and undo buttons are disabled.
iface = gr.ChatInterface(
    fn=generate,
    chatbot=mychatbot,
    retry_btn=None,
    undo_btn=None,
)

# Page layout: a title banner above the chat interface.
with gr.Blocks() as demo:
    gr.HTML(
        "<center><h1>Sqlcoder-7b - Q5_K_M - TheBloke (quantization) & Kukedlc (deploy) </h1></center>"
    )
    iface.render()

# Enable queuing, then start the server on all network interfaces.
demo.queue().launch(
    show_api=False,
    server_name="0.0.0.0",
)