import os
import glob
import json
from pathlib import Path

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

POLICY_DIR = "policies"
STORE_DIR = "rag_store"
META_PATH = os.path.join(STORE_DIR, "meta.json")
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def read_text_like(path: str) -> str:
    """Return the file's text if it is a .txt/.md file, else an empty string."""
    if path.lower().endswith((".txt", ".md")):
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    return ""


def chunk(text: str, size=800, overlap=100):
    """Yield fixed-size character windows, each sharing `overlap` chars with the previous one."""
    i = 0
    n = len(text)
    while i < n:
        yield text[i : i + size]
        i += size - overlap


def main():
    os.makedirs(STORE_DIR, exist_ok=True)
    files = sorted(
        [p for p in glob.glob(os.path.join(POLICY_DIR, "**", "*"), recursive=True) if os.path.isfile(p)]
    )

    # Split every readable file into overlapping chunks, keeping the
    # source path with each chunk so search hits can be attributed.
    docs = []
    for fp in files:
        txt = read_text_like(fp)
        if not txt.strip():
            continue
        for ch in chunk(txt):
            docs.append({"text": ch, "source": os.path.relpath(fp)})

    if not docs:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")

    # Embed all chunks. With normalized embeddings, inner-product search
    # (IndexFlatIP) is equivalent to cosine similarity.
    model = SentenceTransformer(MODEL_NAME)
    texts = [d["text"] for d in docs]
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))
    faiss.write_index(index, INDEX_PATH)

    # Persist the chunk texts and sources alongside the index; FAISS stores
    # only vectors, so this metadata is needed to map ids back to documents.
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": docs}, f, ensure_ascii=False)

    print(f"Indexed {len(docs)} chunks from {len(files)} files → {INDEX_PATH}")


if __name__ == "__main__":
    main()
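
# --- Companion usage sketch --------------------------------------------------
# A minimal sketch of how a retrieval script might query the store built
# above; it is not part of the indexer itself. It reuses INDEX_PATH and
# META_PATH from this file and reads the model name back out of meta.json.
# The function name `search` and the parameter `k` are illustrative choices,
# not an established API. In practice this would live in a separate script.

def search(query: str, k: int = 5):
    """Return the k chunks most similar to `query`, with scores and sources."""
    index = faiss.read_index(INDEX_PATH)
    with open(META_PATH, encoding="utf-8") as f:
        meta = json.load(f)
    # Embed the query the same way the chunks were embedded, so the
    # inner-product scores remain cosine similarities.
    model = SentenceTransformer(meta["model"])
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q.astype(np.float32), k)
    return [
        {"score": float(s), **meta["docs"][i]}
        for s, i in zip(scores[0], ids[0])
        if i != -1  # FAISS pads with -1 ids when fewer than k vectors exist
    ]

# Example:
#   for hit in search("What is the PTO carryover policy?"):
#       print(f'{hit["score"]:.3f}  {hit["source"]}')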