# -*- coding: utf-8 -*-
import torch

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

# Load the documents to index from the ./data directory
documents = SimpleDirectoryReader("./data").load_data()

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url="https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf",
    # Optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Set the context window below the model's maximum to leave some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    # (-1 offloads all layers)
    model_kwargs={"n_gpu_layers": -1},
    # Transform inputs into the Llama-2 prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

from llama_index.embeddings import HuggingFaceEmbedding

# embed_model = HuggingFaceEmbedding()  # loads BAAI/bge-small-en by default
# Initialize our custom embedding model (BAAI/bge-small-en-v1.5)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

service_context = ServiceContext.from_defaults(
    chunk_size=512, llm=llm, embed_model=embed_model
)

"""Advanced RAG with a cross-encoder reranker.

Referred from:
https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy
"""

from llama_index.indices.postprocessor import SentenceTransformerRerank

# Initialize the reranker; it retrieves the top 3 chunks after re-scoring
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3
)

# Create a baseline query engine (without the reranker)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()


def predict(input, history):
    response = query_engine.query(input)
    return str(response)


# Create a query engine with the cross-encoder reranker.
# Note: we first select the top 10 chunks by similarity, then the reranker keeps the best 3.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])


def predict(input, history):
    response = query_engine.query(input)
    return str(response)


import time

import gradio as gr


# This final definition of predict (used by the chat UI below) also reports latency.
def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken

    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)


# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)
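
"""Optional sanity check (not part of the original notebook): a minimal sketch, run
separately or before the Gradio launch, that builds both engines from the same index
and compares their answers on one sample question, so the effect of the cross-encoder
reranker is visible. The question string is a hypothetical placeholder."""

# Assumption: `index` and `rerank` are already defined as above.
baseline_engine = index.as_query_engine()  # retrieval only, no reranking
reranked_engine = index.as_query_engine(
    similarity_top_k=10, node_postprocessors=[rerank]
)

sample_question = "What are the key topics covered in these documents?"  # hypothetical
print("Baseline:", baseline_engine.query(sample_question))
print("Reranked:", reranked_engine.query(sample_question))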