import os
import hashlib
import pickle
import streamlit as st
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import PyPDF2

# Hugging Face Hub inference client bound to the Mixtral 8x7B Instruct model;
# generate() streams completions through it.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

# Sentence-embedding model used to encode both the PDF documents and each user
# question so they can be compared by cosine similarity.
encoder = SentenceTransformer("all-mpnet-base-v2")

def compute_directory_hash(directory):
    """Return an MD5 hex digest that fingerprints the files under *directory*.

    The digest covers, for every file in the tree, its path relative to
    *directory* and a digest of its contents. Adding, removing, renaming or
    editing a file therefore changes the result, while two runs over an
    unchanged tree always agree. MD5 is used purely for change detection,
    not for security.

    Args:
        directory: Root of the tree to fingerprint.

    Returns:
        The hexadecimal MD5 digest as a string. An empty directory yields
        the digest of empty input.
    """
    hash_md5 = hashlib.md5()
    for root, dirs, files in os.walk(directory):
        # os.walk visits sub-directories in whatever order the OS lists them;
        # sorting in place pins the traversal order so the digest is stable.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            file_md5 = hashlib.md5()
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    file_md5.update(chunk)
            # Feed "<relative path>\0<fixed-size content digest>" per file so
            # that renames are detected and bytes cannot silently migrate
            # across the boundary between two adjacent files.
            hash_md5.update(os.fsencode(os.path.relpath(file_path, directory)))
            hash_md5.update(b"\0")
            hash_md5.update(file_md5.digest())
    return hash_md5.hexdigest()

def load_documents_and_create_embeddings(directory):
    """Extract the text of every PDF under *directory* and embed it.

    Files are visited in sorted order so the position of each document (and
    of its embedding) is reproducible between runs. The ``.pdf`` extension is
    matched case-insensitively.

    Args:
        directory: Root of the tree to scan for PDF files.

    Returns:
        A ``(documents, embeddings)`` pair. ``documents`` is a list holding
        one string per PDF (the text of all its pages concatenated) and
        ``embeddings`` is the result of ``encoder.encode(documents)``, in the
        same order.
    """
    documents = []
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # pin the traversal order of sub-directories
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # `or ""` guards against a page for which no text could be
                # extracted; join avoids repeated string re-allocation.
                text = "".join(
                    page.extract_text() or "" for page in reader.pages
                )
            documents.append(text)

    embeddings = encoder.encode(documents)
    return documents, embeddings

def load_or_update_cache(directory):
    """Return documents and embeddings for *directory*, using a pickle cache.

    The cache file ``cache.pkl`` (in the current working directory) stores the
    directory fingerprint together with the documents and embeddings. It is
    reused only when its stored fingerprint equals the current one; a
    missing, unreadable, corrupt or stale cache triggers a rebuild.

    Args:
        directory: Root of the document tree.

    Returns:
        A ``(documents, embeddings)`` pair, either read from the cache or
        freshly computed by ``load_documents_and_create_embeddings``.
    """
    cache_file = "cache.pkl"
    dir_hash = compute_directory_hash(directory)

    cache = None
    try:
        with open(cache_file, "rb") as f:
            # NOTE(security): pickle.load executes arbitrary code embedded in
            # the file. This is acceptable only while cache.pkl is written
            # exclusively by this application; never point it at untrusted data.
            cache = pickle.load(f)
    except (
        OSError,
        EOFError,
        pickle.UnpicklingError,
        AttributeError,
        ImportError,
        IndexError,
        ValueError,
    ):
        # Missing, truncated or otherwise unreadable cache: treat as a miss
        # and rebuild below instead of crashing the app.
        cache = None

    if (
        isinstance(cache, dict)
        and cache.get("hash") == dir_hash
        and "documents" in cache
        and "embeddings" in cache
    ):
        return cache["documents"], cache["embeddings"]

    documents, embeddings = load_documents_and_create_embeddings(directory)

    # Write to a temporary file and rename it into place so an interrupted
    # write can never leave a half-written cache.pkl behind.
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(
            {
                "hash": dir_hash,
                "documents": documents,
                "embeddings": embeddings,
            },
            f,
        )
    os.replace(tmp_file, cache_file)

    return documents, embeddings

def format_prompt(message, history):
    """Build a Mixtral-instruct prompt from the chat history and a new message.

    Each past exchange is rendered as ``[INST] user [/INST] answer</s> `` and
    the new message as a final, unanswered ``[INST] ... [/INST]`` block, all
    prefixed by ``<s>``. Closing every past answer with ``</s>`` follows the
    instruction template published for the model and marks where each
    assistant turn ended.

    Args:
        message: The new user message to be answered.
        history: Iterable of ``(user_prompt, bot_response)`` pairs, oldest
            first.

    Returns:
        The complete prompt string.
    """
    past_turns = "".join(
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in history
    )
    return f"<s>{past_turns}[INST] {message} [/INST]"

def generate(prompt, history, temperature=0.3, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Generate a model answer for *prompt* given the previous conversation.

    The prompt and history are rendered with ``format_prompt`` and sent to the
    module-level inference ``client``. The answer is streamed token by token
    and returned as a single string. Sampling uses a fixed seed (42).

    Args:
        prompt: The new user message (may already include retrieved context).
        history: Iterable of ``(user_prompt, bot_response)`` pairs.
        temperature: Sampling temperature; clamped to at least 0.01
            (presumably because the backend needs a strictly positive value).
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        The generated text, without special tokens such as the closing
        end-of-sequence marker.
    """
    generate_kwargs = dict(
        temperature=max(float(temperature), 1e-2),
        max_new_tokens=max_new_tokens,
        top_p=float(top_p),
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)

    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    # With details=True every streamed item carries token metadata; tokens
    # flagged as special (e.g. "</s>") are control markers, not answer text.
    return "".join(
        response.token.text
        for response in stream
        if not getattr(response.token, "special", False)
    )

# Load the document texts and their embeddings for ./data, reusing the on-disk
# cache when load_or_update_cache() finds it up to date. This is module-level
# code, so it executes every time the script is run.
directory = "./data"
documents, embeddings = load_or_update_cache(directory)

# Streamlit interface: page title
st.title("Preguntale al Buho")

# Chat history: a list of (user_message, bot_response) tuples kept in
# st.session_state; it is created only when the key is not already present.
if 'history' not in st.session_state:
    st.session_state.history = []

# User input: single-line question box registered under the "user_input" key
user_input = st.text_input("Tu duda:", key="user_input")

# On "Enviar": retrieve the most similar document, ask the model, and record
# the exchange. Blank or whitespace-only questions are ignored.
if st.button("Enviar") and user_input.strip():
    if len(documents) == 0:
        # Nothing was indexed, so there is no embedding matrix to compare
        # against; tell the user instead of failing inside cosine_similarity.
        st.warning("No hay documentos disponibles para responder la pregunta.")
    else:
        question_embedding = encoder.encode([user_input])
        # One question in, so the similarity matrix has a single row: take it
        # explicitly so argmax yields a document index.
        similarities = cosine_similarity(question_embedding, embeddings)[0]
        most_similar_idx = int(np.argmax(similarities))
        retrieved_doc = documents[most_similar_idx]
        # Pass a copy so the stored history is only changed by the append below.
        history = st.session_state.history.copy()
        prompt = f"Contexto: {retrieved_doc}\nPregunta: {user_input}"
        bot_response = generate(prompt, history)
        # Only the raw question (without the retrieved context) is remembered.
        st.session_state.history.append((user_input, bot_response))

# Render the whole conversation, oldest exchange first, in one text area
chat_text = "".join(
    f"Tu: {question}\nBuhIA: {answer}\n\n"
    for question, answer in st.session_state.history
)
st.text_area("La respuesta", value=chat_text, height=300, disabled=False)