import torch
import os
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
from transformers import AutoTokenizer, pipeline, GenerationConfig
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import MultiQueryRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.llms import llamacpp, huggingface_pipeline
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
# import os
# os.getenv('hf_token')

# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"

# Prompt used to condense the chat history and a follow-up question into a standalone question.
_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, without changing its content.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

# System prompt prepended to the QA prompt of the combine-documents ("stuff") chain.
system_prompt = """You are a helpful assistant; use the provided context to answer user questions.
Read the given context before answering and think step by step. If you cannot answer a question based on the provided context, inform the user. Do not use any other information to answer. Provide a detailed answer to the question."""

load_dotenv()


def load_quantized_model_gptq(model_id, model_basename):
    """Load a GPTQ-quantized model with AutoGPTQ and wrap it in a LangChain HuggingFacePipeline."""
    # if ".safetensors" in model_basename:
    #     model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir=r"E:\AW\LLMs\models")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        # model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        cache_dir=r"E:\AW\LLMs\models",
    )
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,  # type: ignore
        tokenizer=tokenizer,
        max_length=20000,
        temperature=0.7,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
    return local_llm


def load_quantized_model(model_id=None):
    """Download a GGUF-quantized model from the Hugging Face Hub and load it with LlamaCpp."""
    MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # if model_id == "Zephyr-7b-Beta":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # elif model_id == "Llama-2-7b-chat":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF", "llama-2-7b-chat.Q4_K_M.gguf"
    try:
        # logging.info("Using LlamaCPP for GGUF quantized model")
        model_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_BASENAME,
            resume_download=True,
            cache_dir=r"E:\AW\LLMs\models",
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": 10000,
            "max_tokens": 10000,
            "n_batch": 512,
            # "n_gpu_layers": 6,
        }  # offloading 5 layers to the GPU gave an answer in 6-7 minutes; 3270 MB of VRAM
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        print("Supported model architectures: Llama, Mistral")
        return None


def upload_files(files):
    """Return the local paths of files uploaded through the (currently disabled) upload button."""
    file_paths = [file.name for file in files]
    return file_paths
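

# --- Illustrative sketch, not called anywhere in this app ---------------------------------
# The Chroma store below is opened from persist_directory="db", but this script never builds
# that index. This helper shows one way the directory could be populated with the same BGE
# embeddings. The loader choice, chunk sizes, and source_dir default are assumptions for
# illustration only, not part of the original application.
def build_vectorstore(source_dir="docs", persist_directory="db"):
    """Sketch: index a folder of PDFs into the Chroma directory this app reads from."""
    # Local imports: only needed if you choose to run this optional helper.
    from langchain_community.document_loaders import PyPDFDirectoryLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    documents = PyPDFDirectoryLoader(source_dir).load()
    chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(documents)
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    # With persist_directory set, Chroma writes the index to disk (older Chroma versions
    # may additionally require calling .persist() on the returned store).
    return Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)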


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PrivateGPT
        """)
    with gr.Row():
        with gr.Column(scale=2):  # type: ignore
            # with gr.Column(scale=5):
            #     with gr.Row():
            #         file_output = gr.File(label="Uploaded Documents", show_label=True)
            #     with gr.Row():
            #         upload_button = gr.UploadButton(
            #             "Click to upload files",
            #             file_types=[".pdf", ".csv", ".xlsx", ".txt"],
            #             file_count="multiple",
            #         )
            #         upload_button.upload(upload_files, upload_button, file_output)
            with gr.Row():
                # NOTE: this radio is not yet wired to the model loading below;
                # load_quantized_model() currently ignores its argument and always
                # loads the Zephyr GGUF model.
                model_id = gr.Radio(
                    ["Zephyr-7b-Beta", "Llama-2-7b-chat"],
                    value="Llama-2-7b-chat",
                    label="LLM Model",
                )
                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
            with gr.Row():
                # NOTE: the QA mode is likewise not yet used by the chain below.
                mode = gr.Radio(["Document", "Data"], value="Document", label="QA mode")
            # print(f"selected {model} model with {Temp} temperature")

            # Embedding model and the persisted Chroma vector store. The "db" directory must
            # already contain an indexed collection built with the same embedding model
            # (one way to build it is sketched in build_vectorstore above).
            persist_directory = "db"
            embeddings = HuggingFaceBgeEmbeddings(
                model_name="BAAI/bge-small-en-v1.5",
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": True},
                cache_folder=r"E:\AW\LLMs\models",
            )
            db2 = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

            # llm = load_quantized_model(model_id=model_id)  # type: ignore
            MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
            # MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
            MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
            # -----------------------------------------------------------------------------
            # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
            llm = load_quantized_model()
            # -----------------------------------------------------------------------------

            # Prompts: one to condense the follow-up question, one to answer from retrieved context.
            condense_question_prompt_template = PromptTemplate.from_template(_template)
            prompt_template = system_prompt + """

{context}

Question: {question}
Helpful Answer:"""
            qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

            # Keep only the last exchange in memory (k=1).
            memory = ConversationBufferWindowMemory(memory_key="chat_history", k=1, return_messages=True)
            # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
            # compressor = LLMChainExtractor.from_llm(llm=llm)
            # compression_retriever = ContextualCompressionRetriever(
            #     base_compressor=compressor,
            #     base_retriever=db2.as_retriever(search_kwargs={'k': 5}),
            # )

            # MultiQueryRetriever asks the LLM for several rewordings of the user question
            # and merges the documents retrieved for each of them.
            retriever_from_llm = MultiQueryRetriever.from_llm(
                retriever=db2.as_retriever(search_kwargs={"k": 5}),
                llm=llm,
                # llm=load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ"),
            )

            qa2 = ConversationalRetrievalChain(
                # retriever=db.as_retriever(),
                retriever=retriever_from_llm,
                question_generator=LLMChain(
                    llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True
                ),  # type: ignore
                combine_docs_chain=load_qa_chain(
                    llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True
                ),  # type: ignore
                memory=memory,
                verbose=True,  # type: ignore
            )

            def add_text(history, text):
                """Append the user's message to the chat history and clear the textbox."""
                history = history + [(text, None)]
                return history, ""

            def bot(history):
                """Run the conversational retrieval chain on the latest user message."""
                res = qa2.invoke(
                    {
                        "question": history[-1][0],
                        "chat_history": history[:-1],
                    }
                )
                history[-1][1] = res["answer"]
                torch.cuda.empty_cache()
                return history

        with gr.Column(scale=8):  # type: ignore
            with gr.Row():
                chatbot = gr.Chatbot(
                    [],
                    elem_id="chatbot",
                    label="Chat",
                    height=500,
                    show_label=True,
                    avatar_images=["user.jpeg", "Bot.jpg"],
                )
            with gr.Row():
                with gr.Column(scale=8):  # type: ignore
                    txt = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press enter",
                        container=False,
                    )
                with gr.Column(scale=1):  # type: ignore
                    submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column(scale=1):  # type: ignore
                    clear_btn = gr.Button("Clear", variant="stop")
        txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
            bot, chatbot, chatbot
        )
        submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
            bot, chatbot, chatbot
        )
        clear_btn.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    demo.queue()
    # demo.launch(share=True)
    demo.launch(max_threads=40)
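
# Illustrative, commented-out smoke test (assumes the "db" Chroma store is populated and the
# GGUF model is available). It mirrors what bot() does, just without the Gradio UI; the example
# question is only a placeholder:
#
# result = qa2.invoke({"question": "Summarise the indexed documents.", "chat_history": []})
# print(result["answer"])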