Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import List, Tuple | |
| import numpy as np | |
| from models import embed_texts | |
| from text_utils import chunk_text, read_pdf_text | |
| try: | |
| import faiss # type: ignore | |
| FAISS_OK = True | |
| except Exception: | |
| FAISS_OK = False | |
class RAGStore:
    """In-memory retrieval store over text chunks extracted from PDF files.

    Intended flow: ``ingest(files)`` -> ``build()`` -> ``search(query)``.
    Chunks are embedded with ``embed_texts`` and ranked by inner product,
    using a FAISS ``IndexFlatIP`` when faiss imported successfully and a
    NumPy matrix product otherwise.
    """

    def __init__(self):
        # Chunk texts, in ingestion order.
        self.docs: List[str] = []
        # (file index, chunk index within that file), parallel to ``docs``.
        self.doc_ids: List[Tuple[int, int]] = []
        # float32 embeddings of ``docs`` as returned by ``embed_texts``;
        # None until ``build`` has run.
        self.embs: np.ndarray | None = None
        # FAISS index over ``embs``; None when not built or faiss is missing.
        self.index = None

    def _invalidate(self) -> None:
        """Drop embeddings and index so they can never outlive their corpus."""
        self.embs = None
        self.index = None

    def _is_stale(self) -> bool:
        """Return True when the embeddings do not match the current chunks."""
        return self.embs is None or self.embs.shape[0] != len(self.docs)

    def ingest(self, files) -> Tuple[int, int, str]:
        """Replace the corpus with the chunks read from *files*.

        Args:
            files: Iterable of PDF inputs accepted by ``read_pdf_text``;
                ``None`` or an empty iterable clears the store.

        Returns:
            ``(number of files, number of chunks, all file texts joined
            with newlines)``.

        Any previously built embeddings/index are discarded; call ``build``
        (or just ``search``, which rebuilds on demand) afterwards.
        """
        # Materialize once so iterators/generators can be both iterated and
        # counted.
        file_list = list(files or [])

        # Collect into locals first: if reading a file raises, the store
        # keeps its previous, still-consistent contents.
        docs: List[str] = []
        doc_ids: List[Tuple[int, int]] = []
        texts: List[str] = []
        for file_idx, file in enumerate(file_list):
            text = read_pdf_text(file)
            chunks = chunk_text(text)
            docs.extend(chunks)
            doc_ids.extend((file_idx, chunk_idx) for chunk_idx in range(len(chunks)))
            texts.append(text)

        self.docs, self.doc_ids = docs, doc_ids
        # The old vectors describe the old corpus; searching them would
        # return wrong chunks or index past the end of ``docs``.
        self._invalidate()
        return len(file_list), len(docs), "\n".join(texts)

    def build(self) -> int:
        """Embed every chunk and (re)create the search index.

        Returns:
            The number of indexed chunks (0 when the store is empty).
        """
        if not self.docs:
            self._invalidate()
            return 0

        # FAISS requires a C-contiguous float32 matrix.
        embs = np.ascontiguousarray(embed_texts(self.docs), dtype=np.float32)
        index = None
        if FAISS_OK:
            index = faiss.IndexFlatIP(embs.shape[1])
            index.add(embs)

        # Publish both together so they always describe the same corpus.
        self.embs, self.index = embs, index
        return len(self.docs)

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return up to *k* chunks ranked by inner product with *query*.

        Args:
            query: Text to embed and match against the stored chunks.
            k: Maximum number of chunks to return; values <= 0 yield ``[]``.

        Returns:
            Chunk texts, best match first. Empty when the store has no
            chunks.
        """
        if not self.docs or k <= 0:
            return []
        if self._is_stale():
            # ``build`` was skipped or the corpus changed since; rebuild
            # instead of failing on a missing/mismatched matrix.
            self.build()

        doc_count = len(self.docs)
        # Asking FAISS for more hits than it holds yields -1 padding.
        k = min(k, doc_count)
        query_vec = np.ascontiguousarray(embed_texts([query]), dtype=np.float32)[0]

        if self.index is not None:
            _, indices = self.index.search(query_vec[np.newaxis, :], k)
            ranked_indices = indices[0].tolist()
        else:
            sims = self.embs @ query_vec  # type: ignore[operator]
            ranked_indices = np.argsort(-sims)[:k].tolist()

        # Guard against FAISS's -1 "no result" label, which would otherwise
        # silently select the last chunk via negative indexing.
        return [self.docs[idx] for idx in ranked_indices if 0 <= idx < doc_count]
# Module-level RAGStore instance, created once when this module is imported.
RAG = RAGStore()