# CreditCopilot / rag_store.py
# okara chidera
# chore: refactored code
# 4e36c6c unverified
from __future__ import annotations
from typing import List, Tuple
import numpy as np
from models import embed_texts
from text_utils import chunk_text, read_pdf_text
# FAISS is optional: record whether the import worked so callers can fall
# back to another search path when it did not.
try:
    import faiss  # type: ignore
except Exception:
    FAISS_OK = False
else:
    FAISS_OK = True
class RAGStore:
    """In-memory store of text chunks with embedding-based retrieval.

    ``ingest`` fills ``docs`` with the chunks produced by ``chunk_text``.
    ``build`` embeds them with ``embed_texts`` and, when FAISS imported
    successfully, adds the vectors to a flat inner-product index.
    ``search`` ranks chunks by inner product with the query embedding,
    through FAISS when an index exists and through NumPy otherwise.
    """

    def __init__(self):
        # Chunk texts, in ingestion order.
        self.docs: List[str] = []
        # (file index, chunk index within that file) for each entry of docs.
        self.doc_ids: List[Tuple[int, int]] = []
        # float32 embedding matrix, one row per chunk; None until built.
        self.embs: np.ndarray | None = None
        # FAISS index over embs; None when FAISS is unavailable or not built.
        self.index = None

    def ingest(self, files) -> Tuple[int, int, str]:
        """Replace the stored chunks with those extracted from *files*.

        Each file is read with ``read_pdf_text`` and split with
        ``chunk_text``. Embeddings and index from an earlier ``build`` are
        discarded because they no longer correspond to ``docs``.

        Args:
            files: Iterable of inputs accepted by ``read_pdf_text``;
                ``None`` is treated as empty.

        Returns:
            A tuple ``(file count, chunk count, combined text)`` where the
            combined text is the per-file texts joined by newlines.
        """
        # Materialize once so one-shot iterables can be both walked and counted.
        file_list = list(files or [])
        self.docs, self.doc_ids = [], []
        # Stale vectors would make search() rank against the previous corpus.
        self.embs, self.index = None, None
        combined_text: List[str] = []
        for file_idx, file in enumerate(file_list):
            text = read_pdf_text(file)
            chunks = chunk_text(text)
            self.docs.extend(chunks)
            self.doc_ids.extend(
                (file_idx, chunk_idx) for chunk_idx in range(len(chunks))
            )
            combined_text.append(text)
        return len(file_list), len(self.docs), "\n".join(combined_text)

    def build(self) -> int:
        """Embed every stored chunk and (re)create the search index.

        Returns:
            Number of chunks embedded; 0 when nothing has been ingested.
        """
        if not self.docs:
            # Nothing to embed; drop leftovers so search never sees old vectors.
            self.embs, self.index = None, None
            return 0
        # FAISS expects C-contiguous float32 input.
        self.embs = np.ascontiguousarray(
            embed_texts(self.docs), dtype=np.float32
        )
        if FAISS_OK:
            index = faiss.IndexFlatIP(self.embs.shape[1])
            index.add(self.embs)
            self.index = index
        else:
            self.index = None
        return len(self.docs)

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return up to *k* stored chunks ranked by inner product with *query*.

        If the embeddings are missing or do not match ``docs`` in length,
        ``build`` is run first.

        Args:
            query: Text to embed and compare against the stored chunks.
            k: Maximum number of chunks to return; non-positive values
                yield an empty list.

        Returns:
            Chunk texts, best match first. Empty when nothing is stored.
        """
        if not self.docs or k <= 0:
            return []
        if self.embs is None or len(self.embs) != len(self.docs):
            self.build()
        # Asking for more hits than chunks makes FAISS pad the result with -1.
        k = min(k, len(self.docs))
        query_vec = np.ascontiguousarray(
            embed_texts([query]), dtype=np.float32
        )[0]
        if self.index is not None:
            _, indices = self.index.search(query_vec[np.newaxis, :], k)
            ranked_indices = indices[0].tolist()
        else:
            sims = self.embs @ query_vec  # type: ignore[operator]
            ranked_indices = np.argsort(-sims)[:k].tolist()
        # Drop FAISS's -1 placeholders, which would otherwise select docs[-1].
        doc_count = len(self.docs)
        return [self.docs[idx] for idx in ranked_indices if 0 <= idx < doc_count]
# Module-level RAGStore instance, created when this module is imported.
RAG = RAGStore()