# Attempt at building a RAG (retrieval-augmented generation) app with a well-crafted prompt.
# Libraries
import os
import hashlib
import pickle
import streamlit as st
from google.generativeai import configure, GenerativeModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import PyPDF2

# Configure the Google Generative AI client. The API key is read from the
# GOOGLE_API_KEY environment variable; os.getenv returns None when it is unset.
configure(api_key=os.getenv('GOOGLE_API_KEY'))

# Initialise the Gemini model and open a single chat session on it.
model = GenerativeModel('gemini-1.5-flash')
chat = model.start_chat()

# Initialise the Sentence Transformer model used to compute text embeddings.
encoder = SentenceTransformer("all-mpnet-base-v2")

def compute_directory_hash(directory):
    """Return an MD5 hex digest that fingerprints every file under *directory*.

    For each file, visited in sorted order, the digest covers the file's path
    relative to *directory* followed by the MD5 digest of its contents. The
    result therefore changes when a file is added, removed, renamed, moved or
    edited, and does not depend on the order in which the OS lists directory
    entries.

    MD5 is used here only as a change detector, not for security.

    Args:
        directory: Path of the directory tree to fingerprint.

    Returns:
        The hexadecimal digest string. A missing or empty directory yields
        the digest of empty input.
    """
    hash_md5 = hashlib.md5()
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            file_md5 = hashlib.md5()
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    file_md5.update(chunk)
            # Mix in the relative path (NUL-terminated) and a fixed-length
            # per-file digest: renames become visible and bytes cannot shift
            # from one file to the next without changing the result.
            rel_path = os.path.relpath(file_path, directory)
            hash_md5.update(os.fsencode(rel_path))
            hash_md5.update(b"\0")
            hash_md5.update(file_md5.digest())
    return hash_md5.hexdigest()

def split_into_chunks(text, chunk_size=1000):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields (whitespace-delimited); each
    chunk is re-joined with single spaces. Empty text gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

def _extract_pdf_text(file_path):
    """Return the text of all pages of the PDF at *file_path*, newline-joined."""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. Joining
    # with a newline keeps the last word of one page from fusing with the
    # first word of the next.
    return "\n".join(t for t in page_texts if t)


def load_documents_and_create_embeddings(directory):
    """Read every PDF under *directory*, chunk its text and embed the chunks.

    Files are matched on a case-insensitive ``.pdf`` extension and visited in
    sorted order. A file that fails to parse, or that yields no text, is
    reported with ``print`` and skipped; it does not abort the whole load.

    Args:
        directory: Root of the directory tree to scan.

    Returns:
        A ``(documents, embeddings, file_chunks)`` tuple where ``documents``
        is the flat list of text chunks, ``embeddings`` is the result of
        ``encoder.encode(documents)`` and ``file_chunks`` maps each file name
        to its number of chunks. When no usable PDF is found the result is
        ``([], None, {})``.
    """
    documents = []
    # NOTE: keyed by base name, so two PDFs with the same name in different
    # subdirectories share one entry (the one processed last wins).
    file_chunks = {}
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # deterministic traversal order
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            try:
                text = _extract_pdf_text(file_path)
            except Exception as e:
                # Deliberate best effort: one unreadable PDF must not prevent
                # the remaining files from being indexed.
                print(f"Error al procesar {file_path}: {e}")
                continue
            chunks = split_into_chunks(text) if text else []
            if not chunks:
                print(f"Advertencia: No se pudo extraer texto del archivo {file_path}")
                continue
            file_chunks[file] = len(chunks)
            documents.extend(chunks)

    if not documents:
        return [], None, {}  # no valid PDF documents were found

    embeddings = encoder.encode(documents)
    return documents, embeddings, file_chunks

def _read_cache(cache_file):
    """Return the dict stored in *cache_file*, or None if it is missing or unusable."""
    try:
        with open(cache_file, "rb") as f:
            # SECURITY: pickle can execute arbitrary code while loading. This
            # is acceptable only because the file is written by this
            # application itself; never point it at an untrusted file.
            cache = pickle.load(f)
    except FileNotFoundError:
        return None
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError, ValueError) as e:
        # A truncated or corrupt cache should trigger a rebuild, not a crash.
        print(f"Advertencia: No se pudo leer la caché {cache_file}: {e}")
        return None

    required_keys = ("hash", "documents", "embeddings", "file_chunks")
    if not isinstance(cache, dict) or any(k not in cache for k in required_keys):
        print(f"Advertencia: La caché {cache_file} tiene un formato inesperado")
        return None
    return cache


def load_or_update_cache(directory):
    """Return documents, embeddings and chunk counts, using a pickle cache.

    The cache lives in ``cache.pkl`` (relative to the current working
    directory) and is reused only when the stored directory hash equals the
    current one. Otherwise, or when the cache is missing, corrupt or has an
    unexpected shape, the documents are reloaded and the cache rewritten.

    Args:
        directory: Directory tree containing the source documents.

    Returns:
        The ``(documents, embeddings, file_chunks)`` tuple, taken from the
        cache or freshly computed.
    """
    cache_file = "cache.pkl"
    dir_hash = compute_directory_hash(directory)

    cache = _read_cache(cache_file)
    if cache is not None and cache["hash"] == dir_hash:
        return cache["documents"], cache["embeddings"], cache["file_chunks"]

    documents, embeddings, file_chunks = load_documents_and_create_embeddings(directory)
    if embeddings is not None:  # only update the cache when valid documents were found
        with open(cache_file, "wb") as f:
            pickle.dump({
                "hash": dir_hash,
                "documents": documents,
                "embeddings": embeddings,
                "file_chunks": file_chunks,
            }, f)

    return documents, embeddings, file_chunks

def generate_response(prompt, context=None):
    """Ask Gemini to answer *prompt*, optionally grounded in *context*.

    With a non-empty *context*, two messages are sent through the
    module-level ``chat`` session: one that includes the context and one that
    asks for a general-knowledge answer. Both answers are returned together,
    the second introduced by "En un contexto general:".

    Without context only the general-knowledge question is sent, once, and
    its answer is returned as is. Previously the identical prompt was sent
    twice and both answers were concatenated, doubling the API calls for a
    redundant result.

    Args:
        prompt: The user's question.
        context: Retrieved text to ground the answer in; ``None`` or an empty
            string means no context is available.

    Returns:
        The response text to display.
    """
    general_prompt = (
        f"Pregunta: {prompt}\n\n"
        "Por favor, responde a la pregunta utilizando tu conocimiento general."
    )
    if not context:
        return chat.send_message(general_prompt).text

    context_prompt = (
        f"Contexto: {context}\n\n"
        f"Pregunta: {prompt}\n\n"
        "Por favor, responde a la pregunta basándote en el contexto "
        "proporcionado tu eres un experto en el TEST DE RORSCHACH."
    )
    # Order matters: the contextual question is asked first, then the general
    # one, both within the same chat session.
    response_with_context = chat.send_message(context_prompt).text
    response_general = chat.send_message(general_prompt).text

    return f"{response_with_context}\n\nEn un contexto general: {response_general}"

def reduce_context(context, max_tokens=8000):
    """Truncate *context* to its first *max_tokens* whitespace-separated words.

    Despite the parameter name, the limit counts words as produced by
    ``str.split()``, not model tokens. Text within the limit is returned
    unchanged; truncated text is re-joined with single spaces.
    """
    tokens = context.split()
    if len(tokens) <= max_tokens:
        return context
    return " ".join(tokens[:max_tokens])

# Load the documents and their embeddings (from the cache when it is current).
directory = "./data"
documents, embeddings, file_chunks = load_or_update_cache(directory)

# Streamlit interface
st.title("Pregúntale al Búho")

# User input
user_input = st.text_input("Tu duda:", key="user_input")

# Answer only when the button was pressed and a question was typed.
if st.button("Enviar") and user_input:
    # Stays None unless a sufficiently similar chunk is found, in which case
    # the model answers from general knowledge only.
    context = None
    has_index = embeddings is not None and len(documents) > 0
    if has_index:
        query_vector = encoder.encode([user_input])
        scores = cosine_similarity(query_vector, embeddings)
        # argmax works on the flattened score matrix; with a single query
        # this is the index of the best-matching chunk.
        best_idx = np.argmax(scores)
        candidate = reduce_context(documents[best_idx])
        # Similarity threshold: below it the chunk is treated as irrelevant.
        if scores[0][best_idx] > 0.07:
            context = candidate

    response = generate_response(user_input, context=context)
    st.text_area("Respuesta del Búho:", value=response, height=300)

# Show per-file chunk information at the bottom of the page (currently disabled).
# if file_chunks:
#     st.markdown("---")  # Add a divider line for visual separation
#     st.subheader("Información sobre los archivos procesados:")
#     for file, num_chunks in file_chunks.items():
#         st.write(f"- {file}: {num_chunks} chunks")

# No-op entry-point guard: all of the script's work is done by the top-level
# statements of this module.
if __name__ == "__main__":
    pass