Spaces:
Running
Running
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import numpy as np | |
class RetrievalEngine:
    """Semantic retrieval over a GMAT quant corpus (FAISS L2 over MiniLM embeddings)."""

    def __init__(self):
        # Pre-chunked JSONL corpus hosted on the Hugging Face Hub; each row has a "text" field.
        self.dataset = load_dataset(
            "json",
            data_files="https://huggingface.co/datasets/j-js/gmat-quant-corpus/resolve/main/gmat_hf_chunks.jsonl",
            split="train",
        )
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        texts = [row["text"] for row in self.dataset]
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        # FAISS requires C-contiguous float32 input; be explicit rather than
        # relying on the encoder's default dtype.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)
        self.texts = texts

    def search(self, query, k=3):
        """Return up to *k* corpus passages nearest to *query* by L2 distance.

        Returns fewer than *k* results (possibly an empty list) when the
        corpus is smaller than *k* or *k* is non-positive.
        """
        if k <= 0 or not self.texts:
            return []
        q_emb = self.model.encode([query], convert_to_numpy=True)
        q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)
        # Clamp k: asking FAISS for more neighbors than indexed vectors makes
        # it pad `indices` with -1 sentinels, and texts[-1] would then silently
        # return the LAST passage as a bogus hit (bug in the original).
        k = min(k, len(self.texts))
        _distances, indices = self.index.search(q_emb, k)
        # Defensive filter: drop any -1 sentinel indices FAISS may still emit.
        return [self.texts[i] for i in indices[0] if i >= 0]