"""In-memory retrieval store: chunk PDFs, embed the chunks, and search them."""
from __future__ import annotations

from typing import List, Tuple

import numpy as np

from models import embed_texts
from text_utils import chunk_text, read_pdf_text

# FAISS is optional. The import is guarded broadly on purpose: any failure to
# load it simply switches the store to the NumPy fallback in ``search``.
try:
    import faiss  # type: ignore

    FAISS_OK = True
except Exception:
    FAISS_OK = False


class RAGStore:
    """Hold text chunks plus their embeddings and answer top-k queries.

    Typical usage is ``ingest(files)`` -> ``build()`` -> ``search(query)``.
    """

    def __init__(self):
        # Text chunks, in ingestion order.
        self.docs: List[str] = []
        # One (file_idx, chunk_idx) pair per entry in ``docs``.
        self.doc_ids: List[Tuple[int, int]] = []
        # Chunk embeddings produced by ``build``; None until built.
        self.embs: np.ndarray | None = None
        # FAISS index produced by ``build`` when FAISS is available.
        self.index = None

    def ingest(self, files) -> Tuple[int, int, str]:
        """Read and chunk ``files``, replacing any previously ingested data.

        Args:
            files: Iterable of objects accepted by ``read_pdf_text``; ``None``
                or an empty iterable ingests nothing.

        Returns:
            ``(number of files, number of chunks, combined text)`` where the
            combined text is every file's text joined with newlines.
        """
        file_list = list(files or [])

        self.docs, self.doc_ids = [], []
        # Embeddings/index from an earlier corpus no longer line up with the
        # new ``docs``; drop them so ``search`` cannot return stale hits.
        self.embs = None
        self.index = None

        combined_text: List[str] = []
        for file_idx, file in enumerate(file_list):
            text = read_pdf_text(file)
            chunks = chunk_text(text)
            self.docs.extend(chunks)
            self.doc_ids.extend(
                (file_idx, chunk_idx) for chunk_idx in range(len(chunks))
            )
            combined_text.append(text)

        return len(file_list), len(self.docs), "\n".join(combined_text)

    def build(self) -> int:
        """Embed all ingested chunks and (re)build the search index.

        Returns:
            The number of indexed chunks (0 when nothing has been ingested).
        """
        # Always start from a clean slate so an empty corpus does not keep a
        # previous index alive.
        self.embs = None
        self.index = None
        if not self.docs:
            return 0

        # FAISS expects C-contiguous float32 input.
        self.embs = np.ascontiguousarray(
            embed_texts(self.docs), dtype="float32"
        )
        if FAISS_OK:
            dim = self.embs.shape[1]
            # Inner-product index: scores match the NumPy fallback's ``@``.
            self.index = faiss.IndexFlatIP(dim)
            self.index.add(self.embs)
        return len(self.docs)

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return up to ``k`` chunks ranked by inner-product similarity.

        Args:
            query: Text to embed and compare against the stored chunks.
            k: Maximum number of chunks to return.

        Returns:
            The best-matching chunks, best first. Empty when nothing has been
            ingested, ``build`` has not been called since the last ingest, or
            ``k`` is not positive.
        """
        if not self.docs or self.embs is None or k <= 0:
            return []

        doc_count = len(self.docs)
        # Asking for more neighbours than exist only yields padding entries.
        k = min(k, doc_count)

        query_vec = np.ascontiguousarray(
            embed_texts([query]), dtype="float32"
        )[0]

        if self.index is not None:
            _, indices = self.index.search(query_vec[np.newaxis, :], k)
            ranked_indices = indices[0].tolist()
        else:
            sims = self.embs @ query_vec
            ranked_indices = np.argsort(-sims)[:k].tolist()

        # FAISS marks missing results with -1; without this range check a -1
        # would silently index the last chunk and return it as a bogus hit.
        return [self.docs[idx] for idx in ranked_indices if 0 <= idx < doc_count]


RAG = RAGStore()