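"""Build a small FAISS retrieval index over local policy documents.

Reads .txt/.md files under POLICY_DIR, splits them into overlapping
character chunks, embeds each chunk with a sentence-transformers model,
and writes a FAISS inner-product index plus a JSON metadata file.

Third-party dependencies (one common install):
    pip install sentence-transformers faiss-cpu numpy
"""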
import os, glob, json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
POLICY_DIR = "policies"  # input: directory of .txt/.md documents
STORE_DIR = "rag_store"  # output: index and metadata are written here
META_PATH = os.path.join(STORE_DIR, "meta.json")
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim embeddings
def read_text_like(path: str) -> str:
    """Return file contents for plain-text formats; empty string otherwise."""
    if path.lower().endswith((".txt", ".md")):
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    return ""
def chunk(text: str, size=800, overlap=100):
    """Yield fixed-size character chunks, each overlapping the last."""
    i = 0
    n = len(text)
    while i < n:
        yield text[i : i + size]
        i += size - overlap
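
# With the defaults, chunk starts advance by 700 characters (800 - 100),
# so a 2,000-character document yields chunks starting at 0, 700, and 1400.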
def main():
    os.makedirs(STORE_DIR, exist_ok=True)
    files = sorted(
        [p for p in glob.glob(os.path.join(POLICY_DIR, "**", "*"), recursive=True)
         if os.path.isfile(p)]
    )
    # One record per chunk, tagged with the file it came from.
    docs = []
    for fp in files:
        txt = read_text_like(fp)
        if not txt.strip():
            continue  # skip unsupported or empty files
        for ch in chunk(txt):
            docs.append({"text": ch, "source": os.path.relpath(fp)})
    if not docs:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")
    # Normalized embeddings + inner-product index = cosine-similarity search.
    model = SentenceTransformer(MODEL_NAME)
    texts = [d["text"] for d in docs]
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": docs}, f, ensure_ascii=False)
    # Report only files that actually contributed chunks.
    n_sources = len({d["source"] for d in docs})
    print(f"Indexed {len(docs)} chunks from {n_sources} files → {INDEX_PATH}")
if __name__ == "__main__":
    main()