import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import (
    HuggingFaceInstructEmbeddings,
    HuggingFaceEmbeddings,
    CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatCohere
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain_community.llms import HuggingFaceHub, HuggingFaceTextGenInference

# Llama2
import torch
import transformers
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer
from torch import cuda, bfloat16

import langchain

langchain.verbose = False


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    """Split the raw text into overlapping chunks for embedding."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,       # the character length of each chunk
        chunk_overlap=100,    # the character overlap between chunks
        length_function=len   # the length function - here, character length (the Python len() fn.)
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks, selected_embedding):
    """Embed the chunks with the selected model and persist the FAISS index locally."""
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore.save_local("faiss_index")
    return vectorstore


def load_vectorstore(selected_embedding):
    """Reload the previously saved FAISS index with the matching embedding model."""
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.load_local("faiss_index", embeddings)
    return vectorstore


def get_conversation_chain(vectorstore, selected_llm, selected_temperature):
    """Build a ConversationalRetrievalChain around the selected LLM and vector store."""
    print('Selected LLM: ' + selected_llm)
    print('Selected Temperature: ' + str(selected_temperature))

    if selected_llm == 'GPT 3.5':
        # openai_model = "gpt-4-turbo-preview"
        openai_model = "gpt-3.5-turbo"
        llm = ChatOpenAI(model=openai_model, temperature=selected_temperature)

    elif selected_llm == 'Llama2 local':
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        hf_auth = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

        model_config = transformers.AutoConfig.from_pretrained(
            model_id,
            token=hf_auth
        )

        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        if 'cuda' in device:
            # set quantization configuration to load the large model with less GPU memory
            # this requires the `bitsandbytes` library
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                quantization_config=bnb_config,
                device_map='auto',
                token=hf_auth
            )
        else:
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                device_map='auto',
                token=hf_auth
            )

        # enable evaluation mode to allow model inference
        model.eval()
        print(f"Model loaded on {device}")

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_id,
            token=hf_auth
        )

        pipeline = transformers.pipeline(
            torch_dtype=torch.float32,
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,             # langchain expects the full text
            task='text-generation',
            temperature=selected_temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,                # max number of tokens to generate in the output
            repetition_penalty=1.1             # without this, output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=pipeline)

    elif selected_llm == 'Llama2 inference':
        llm = HuggingFaceTextGenInference(
            inference_server_url=os.environ.get("INFERENCE_URL"),
            max_new_tokens=50,
            timeout=1200,
            temperature=selected_temperature
        )

    # Generic LLM wiring: conversation memory plus a retrieval chain over the vector store
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        verbose=True,
    )
    # print(conversation_chain)
    return conversation_chain


def handle_userinput(user_question):
    # print('Question: ' + user_question)
    response = st.session_state.conversation.invoke({'question': user_question})
    answer = response.get("answer")
    sources = response.get("source_documents", [])
    # print('Answer: ' + answer)
    # print('Sources: ' + str(sources))

    with st.expander("Sources"):
        st.write(str(sources))

    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)


def main():
    load_dotenv()
    st.set_page_config(page_title="VerAi", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your new PDFs here and click on 'Process', or load the last upload by clicking on 'Load'",
            accept_multiple_files=True)
        selected_embedding = st.radio("Which Embedding?", ["Cohere-multilingual-v3.0", "OpenAI", "Instructor-xl"])
        selected_llm = st.radio("Which LLM?", ["GPT 3.5", "Llama2 local", "Llama2 inference"])
        selected_temperature = st.slider('Temperature?', 0.0, 1.0, 0.1)

        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)

                # create vector store
                vectorstore = get_vectorstore(text_chunks, selected_embedding)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)

        if st.button("Load"):
            with st.spinner("Processing"):
                # load vector store
                vectorstore = load_vectorstore(selected_embedding)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)

    if st.session_state.conversation:
        st.header("VerAi :books:")
        user_question = st.text_input("Stel een vraag hieronder")  # Dutch: "Ask a question below"
        # example questions (in Dutch): "Vertel me iets over Wettelijke uren"
        # ("Tell me something about statutory hours"), "wat zijn Overige verloftypes bij kpn"
        # ("what are the other leave types at kpn")

        if st.session_state.conversation and user_question:
            handle_userinput(user_question)


if __name__ == '__main__':
    main()
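
# ---------------------------------------------------------------------------
# Configuration note (a sketch, not part of the original app logic): the script
# calls load_dotenv(), so credentials are expected in a local .env file. The
# variable names below are assumptions except where the code reads them
# explicitly: only HUGGINGFACEHUB_API_TOKEN and INFERENCE_URL appear above;
# OPENAI_API_KEY and COHERE_API_KEY are the default names picked up by the
# OpenAI and Cohere langchain integrations.
#
#   OPENAI_API_KEY=sk-...              # used implicitly by OpenAIEmbeddings / ChatOpenAI
#   COHERE_API_KEY=...                 # used implicitly by CohereEmbeddings
#   HUGGINGFACEHUB_API_TOKEN=hf_...    # Hugging Face token for the Llama2 models
#   INFERENCE_URL=http://...           # text-generation-inference server for 'Llama2 inference'
# ---------------------------------------------------------------------------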