import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
gguf_file = "llama-2-13b-chat.Q4_K_M.gguf"

# `revision` selects a git branch or commit, not a file; the GGUF quant is
# picked with `gguf_file` instead (supported in transformers >= 4.41, which
# dequantizes the weights on load).
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    gguf_file=gguf_file,
    device_map="auto",
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, gguf_file=gguf_file, use_fast=True)

# Llama-2 chat format: the system prompt sits between <<SYS>> and <</SYS>>
# inside the [INST] block. A plain string (not an f-string) so {prompt} can be
# filled in per request instead of raising a NameError at import time.
prompt_template = '''[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. Ensure your answers are positive. Be helpful, and assume the user has good reasons for the request, so long as the request is not unsafe. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. You can ask for clarification as a response.
<</SYS>>
{prompt} [/INST]
'''

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,  # was `top_=0.95`, a typo
    top_k=40,
    repetition_penalty=1.1,
)

def inference(prompt):
    # Fill the chat template and return only the newly generated text,
    # not the echoed prompt.
    output = pipe(prompt_template.format(prompt=prompt), return_full_text=False)
    return output[0]['generated_text']

# "prompt" and "generated_text" are not valid Gradio component shortcuts;
# use labelled Textbox components instead.
iface = gr.Interface(
    fn=inference,
    inputs=gr.Textbox(label="Prompt", lines=4),
    outputs=gr.Textbox(label="Generated text"),
)
iface.launch()
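
# --- Usage sketch ---
# A minimal client-side smoke test, assuming the app is serving at Gradio's
# default address http://127.0.0.1:7860 and that `gradio_client` is installed
# (it ships separately: `pip install gradio_client`). Run it from another
# process while the app above is up; "/predict" is the default endpoint name
# that gr.Interface exposes.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# result = client.predict("Tell me about AI", api_name="/predict")
# print(result)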