import gradio as gr
from llama_cpp import Llama


def llama_cpp_chat(gguf_model: Llama, prompt: str, messages: str = '') -> str:
    """Run one completion against the GGUF model, prefixing the prompt with the chat history."""
    prompt_templated = f'{messages}\n### HUMAN:\n{prompt}\n### ASSISTANT:'
    output = gguf_model(
        prompt_templated,  # Prompt
        max_tokens=512,
        stop=["### HUMAN:", "### ASSISTANT:"],  # Stop just before the model would start a new turn
        echo=True,  # Echo the prompt back in the output
    )  # Generate a completion; create_completion can also be called directly
    print(output)
    return output['choices'][0]['text']


llm = Llama(
    model_path="llama3_8b_chat_brainstorm.Q2_K.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)


def chatty(prompt, history):
    """Gradio ChatInterface callback: rebuild the prompt from the history, then generate a reply."""
    print(prompt)
    print(f'history: {history}')
    past_messages = ''
    for idx, message in enumerate(history):
        print(f'idx: {idx}, message: {message}')
        past_messages += f'\n### HUMAN: {message[0]}'
        past_messages += f'\n### ASSISTANT: {message[1]}'
    print(f'past_messages: {past_messages}')
    response = llama_cpp_chat(llm, prompt, past_messages)
    # echo=True returns the prompt as well, so keep only the newest assistant turn
    return response.split('### ASSISTANT:')[-1]


demo = gr.ChatInterface(
    fn=chatty,
    title="Brainstorm on CPU with llama.cpp",
    description="Please note that CPU inference will be very slow - but this can run on the Free Tier :)",
)

if __name__ == "__main__":
    demo.launch()