# rag.py
import os
import json
import pickle
import logging
from typing import List, Tuple, Optional

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

from config import VECTORSTORE_DIR, EMBEDDING_MODEL

log = logging.getLogger(__name__)


class RAGAgent:
    """Retrieve text chunks from a FAISS index stored on disk.

    The agent looks inside ``vectorstore_dir`` (defaulting to
    ``VECTORSTORE_DIR`` from config) for a serialized FAISS index and a
    metadata side-car file (pickle or JSON).  Entry ``i`` of the metadata is
    treated as the record for vector id ``i`` of the index.

    Loading is lazy: nothing is read from disk until :meth:`load` or
    :meth:`retrieve` is called, and the embedding model is only instantiated
    on first access to :attr:`embedder`.

    Public API: ``retrieve(query, k) -> (contexts: List[str], sources: List[dict])``.
    """

    # File extensions that identify a serialized FAISS index.
    _INDEX_EXTENSIONS = (".faiss", ".index", ".bin")
    # File extensions used by metadata side-cars; never accepted as an index.
    _META_EXTENSIONS = (".pkl", ".json")
    # Well-known metadata file names, in order of preference.
    _META_CANDIDATES = (
        "index.pkl",
        "metadata.pkl",
        "index_meta.pkl",
        "metadata.json",
        "index.json",
    )
    # Keys probed (in order) when pulling the chunk text out of a dict record.
    _TEXT_KEYS = ("text", "page_content", "content", "chunk_text", "source_text")

    def __init__(self, vectorstore_dir: Optional[str] = None, embedding_model: Optional[str] = None):
        """Configure the agent without touching the disk.

        Args:
            vectorstore_dir: Directory holding the index and metadata files.
                Falls back to ``VECTORSTORE_DIR`` when omitted or empty.
            embedding_model: Name passed to ``SentenceTransformer``.
                Falls back to ``EMBEDDING_MODEL`` when omitted or empty.
        """
        self.vectorstore_dir = vectorstore_dir or str(VECTORSTORE_DIR)
        self.embedding_model_name = embedding_model or EMBEDDING_MODEL
        self.index: Optional["faiss.Index"] = None
        self.metadata: Optional[List[dict]] = None
        self._embedder: Optional["SentenceTransformer"] = None
        self._loaded = False

    def _list_files(self) -> List[str]:
        """Return the names of regular files in the vector store, sorted.

        Sorting makes file discovery deterministic (``os.listdir`` order is
        arbitrary); sub-directories are skipped because neither the index nor
        the metadata can be a directory.
        """
        if not os.path.isdir(self.vectorstore_dir):
            return []
        return sorted(
            name
            for name in os.listdir(self.vectorstore_dir)
            if os.path.isfile(os.path.join(self.vectorstore_dir, name))
        )

    def _find_index_file(self) -> Optional[str]:
        """Locate the FAISS index file.

        Returns:
            Full path of the index file, or ``None`` when the directory is
            missing or contains no plausible index.
        """
        if not os.path.isdir(self.vectorstore_dir):
            log.warning("Vectorstore dir not found: %s", self.vectorstore_dir)
            return None

        names = self._list_files()

        # A recognised index extension is the strongest signal, so try it first.
        for name in names:
            if name.endswith(self._INDEX_EXTENSIONS):
                return os.path.join(self.vectorstore_dir, name)

        # Fallback: files merely named "index*".  Metadata side-cars such as
        # "index.pkl" / "index.json" share that prefix and must be excluded,
        # otherwise they could be handed to faiss.read_index().
        for name in names:
            if name.startswith("index") and not name.endswith(self._META_EXTENSIONS):
                return os.path.join(self.vectorstore_dir, name)

        return None

    def _find_meta_file(self) -> Optional[str]:
        """Locate the metadata side-car file.

        Well-known names are preferred; otherwise the first ``.pkl`` / ``.json``
        file (in sorted order) is used.

        Returns:
            Full path of the metadata file, or ``None`` if none is found.
        """
        if not os.path.isdir(self.vectorstore_dir):
            return None

        for candidate in self._META_CANDIDATES:
            path = os.path.join(self.vectorstore_dir, candidate)
            if os.path.isfile(path):
                return path

        for name in self._list_files():
            if name.endswith(self._META_EXTENSIONS):
                return os.path.join(self.vectorstore_dir, name)

        return None

    @property
    def embedder(self) -> "SentenceTransformer":
        """The sentence-embedding model, instantiated on first access."""
        if self._embedder is None:
            log.info("Loading embedder: %s", self.embedding_model_name)
            self._embedder = SentenceTransformer(self.embedding_model_name)
        return self._embedder

    @staticmethod
    def _normalize_metadata(raw: object) -> list:
        """Convert a deserialized metadata payload into a list.

        * A list is returned unchanged.
        * A dict is flattened to its values, ordered by key.  Integer-like
          keys (ints or numeric strings, as produced by JSON) are ordered
          numerically so that ``"10"`` follows ``"9"`` rather than ``"1"``;
          other keys are ordered by plain comparison, and unorderable keys
          keep their insertion order.
        * Any other iterable is materialised with ``list()``.

        Raises:
            TypeError: If ``raw`` is a string/bytes value or is not iterable,
                since neither can represent per-vector metadata.
        """
        if isinstance(raw, list):
            return raw

        if isinstance(raw, dict):
            keys = list(raw)
            if all(isinstance(key, (int, str)) for key in keys):
                try:
                    return [raw[key] for key in sorted(keys, key=int)]
                except ValueError:
                    pass  # at least one non-numeric string key
            try:
                ordered = sorted(keys)
            except TypeError:
                ordered = keys  # mixed key types: keep insertion order
            return [raw[key] for key in ordered]

        if isinstance(raw, (str, bytes)):
            # list("abc") would silently yield single characters.
            raise TypeError("unsupported metadata type: %s" % type(raw).__name__)

        return list(raw)

    @classmethod
    def _extract_text(cls, meta: object) -> str:
        """Pull the chunk text out of one metadata record.

        Dict records are probed for the keys in ``_TEXT_KEYS`` (first truthy
        value wins), then for ``text`` / ``page_content`` inside a nested
        ``"metadata"`` dict.  String records are returned as-is.  When no text
        is found the record's ``str()`` form is used so the caller always
        receives a string.
        """
        if isinstance(meta, str):
            return meta

        text = None
        if isinstance(meta, dict):
            for key in cls._TEXT_KEYS:
                if meta.get(key):
                    text = meta[key]
                    break
            if text is None:
                nested = meta.get("metadata")
                if isinstance(nested, dict):
                    text = nested.get("text") or nested.get("page_content")

        if text is None:
            return str(meta)
        return text if isinstance(text, str) else str(text)

    def load(self) -> None:
        """Load index and metadata into memory (idempotent).

        On any failure the problem is logged and the agent is left unloaded
        (``index`` and ``metadata`` untouched), so a later call retries.
        ``index`` and ``metadata`` are only assigned together, after both
        have been read successfully.
        """
        if self._loaded:
            return

        idx_path = self._find_index_file()
        meta_path = self._find_meta_file()

        if not idx_path or not meta_path:
            log.warning("No index/metadata found in %s — retrieval disabled.", self.vectorstore_dir)
            return

        log.info("Loading FAISS index from: %s", idx_path)
        try:
            index = faiss.read_index(idx_path)
        except Exception as e:  # best-effort: any read failure disables retrieval
            log.error("Failed to read FAISS index: %s", e)
            return

        log.info("Loading metadata from: %s", meta_path)
        try:
            if meta_path.endswith(".json"):
                with open(meta_path, "r", encoding="utf-8") as f:
                    raw = json.load(f)
            else:
                # SECURITY: pickle.load executes arbitrary code embedded in the
                # file.  Only point the agent at vector stores you trust.
                with open(meta_path, "rb") as f:
                    raw = pickle.load(f)
            metadata = self._normalize_metadata(raw)
        except Exception as e:  # best-effort: unreadable/invalid metadata disables retrieval
            log.error("Failed to read metadata: %s", e)
            return

        # A size mismatch usually means the two files come from different
        # builds; retrieval still works but some hits may be dropped.
        ntotal = getattr(index, "ntotal", None)
        if ntotal is not None and ntotal != len(metadata):
            log.warning(
                "Index/metadata size mismatch: index has %s vectors, metadata has %d entries",
                ntotal,
                len(metadata),
            )

        self.index = index
        self.metadata = metadata
        log.info("Loaded index and metadata: metadata length=%d", len(self.metadata))
        self._loaded = True

    def retrieve(self, query: str, k: int = 3) -> Tuple[List[str], List[dict]]:
        """Return the top-``k`` chunks for ``query``.

        Args:
            query: Natural-language query to embed and search for.
            k: Maximum number of results; non-positive values yield no results.

        Returns:
            A pair ``(contexts, sources)`` of equal length (possibly shorter
            than ``k``, or empty when retrieval is unavailable):

            - contexts: chunk texts, best match first as ranked by the index.
            - sources: ``{"meta": <metadata record>, "score": float}`` where
              ``score`` is the raw distance/similarity reported by FAISS.
        """
        if not self._loaded:
            self.load()

        if self.index is None or self.metadata is None:
            return [], []

        if k <= 0:
            return [], []

        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")

        # The query is always L2-normalised (in place), which suits indexes
        # built from normalised vectors (cosine / inner product).
        # NOTE(review): for an index built from un-normalised vectors this
        # changes the ranking -- confirm how the store is built.
        try:
            faiss.normalize_L2(q_emb)
        except Exception as e:  # best-effort: search proceeds with the raw embedding
            log.debug("Query normalization skipped: %s", e)

        try:
            D, I = self.index.search(q_emb, k)
        except Exception as e:
            log.warning("FAISS search error: %s", e)
            return [], []

        if I is None or D is None:
            return [], []

        # Single query => flatten the (1, k) result matrices.
        indices = np.asarray(I).reshape(-1)[:k].tolist()
        scores = np.asarray(D).reshape(-1)[:k].tolist()

        contexts: List[str] = []
        sources: List[dict] = []
        for raw_idx, score in zip(indices, scores):
            idx = int(raw_idx)
            # FAISS pads missing neighbours with -1; ids beyond the metadata
            # length indicate an index/metadata mismatch.  Skip both.
            if not 0 <= idx < len(self.metadata):
                continue

            meta = self.metadata[idx]
            contexts.append(self._extract_text(meta))
            sources.append({"meta": meta, "score": float(score)})

        return contexts, sources