from fastapi import FastAPI
from transformers import pipeline
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor
from llama_cpp import Llama

from huggingface_hub import hf_hub_download

# NOTE: docs_url="/" serves the interactive API docs at the root path of the
# app, so the docs double as the landing page when the app is hosted on Spaces.
app = FastAPI(docs_url="/")

# Create embeddings model with content support
# embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
# embeddings.load('index')

# Create extractor instance
#extractor = Extractor(embeddings, "google/flan-t5-base")

# pipe = pipeline(model="TheBloke/Llama-2-7B-GGML/llama-2-7b.ggmlv3.q4_0.bin")

# Hugging Face Hub repository and file name of the quantized Llama 2 weights.
model_name_or_path = "TheBloke/Llama-2-7B-GGML"
model_basename = "llama-2-7b.ggmlv3.q4_0.bin"
# Runs at import time: resolves the weights file through the Hub and keeps the
# returned value as the path handed to llama.cpp below.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# Single module-level model instance used by the /generate endpoint.
# NOTE(review): this is a GGML (ggmlv3) file; newer llama-cpp-python releases
# appear to expect GGUF instead - confirm the installed version can load it.
llm = Llama(model_path=model_path)

@app.get("/generate")
def generate(text: str):
    """
    Complete the given text with the llama2 q4 backend.

    - **text**: prompt passed verbatim to the model.

    Returns a JSON object of the form `{"output": "<generated text>"}`.
    """
    completion = llm(text)
    # llama-cpp-python returns an OpenAI-style completion dict
    # ({"choices": [{"text": ...}, ...], ...}), not the
    # [{"generated_text": ...}] list produced by transformers pipelines.
    return {"output": completion["choices"][0]["text"]}


def prompt(question):
    """Build the instruction text sent to the extractor for *question*.

    The result is three lines: the fixed instruction, a ``Question:`` line
    holding *question*, and a trailing ``Context: `` line left open for the
    context to be appended after it.
    """
    # The continuation lines keep their leading spaces; they are part of the
    # text that is returned.
    template = """Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
            Question: {question}
            Context: """
    return template.format(question=question)


def search(query, question=None):
    """Run one extractive question-answering task and return its answer.

    *query* is passed to the extractor as the second element of the task
    tuple; *question* is wrapped by ``prompt()`` and passed as the third.
    When *question* is empty or omitted, *query* is used for both.

    NOTE: relies on a module-level ``extractor``; the line that constructs it
    is currently commented out in this file, so calling this raises NameError
    until it is restored.
    """
    # Fall back to the query text when no explicit question is supplied.
    effective_question = question or query

    task = ("answer", query, prompt(effective_question), False)
    results = extractor([task])

    # Only one task was submitted; return the second element of its result.
    return results[0][1]


# @app.get("/rag")
# def rag(question: str):
#     # question = "what is the document about?"
#     answer = search(question)
#     # print(question, answer)
#     return {answer}