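"""Build a local FAISS retrieval index over plain-text policy documents.

Reads .txt/.md files from POLICY_DIR, splits them into overlapping character
chunks, embeds each chunk with a sentence-transformers model, and persists a
flat inner-product index plus row-aligned JSON metadata under STORE_DIR.
"""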
import glob
import json
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

POLICY_DIR = "policies"
STORE_DIR = "rag_store"
META_PATH = os.path.join(STORE_DIR, "meta.json")
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def read_text_like(path: str) -> str:
    """Return the file's text for plain-text formats; everything else is skipped."""
    if path.lower().endswith((".txt", ".md")):
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    return ""

def chunk(text: str, size=800, overlap=100):
    """Yield fixed-size character windows that overlap by `overlap` characters."""
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")
    step = size - overlap
    for i in range(0, len(text), step):
        yield text[i : i + size]

def main():
    os.makedirs(STORE_DIR, exist_ok=True)
    files = sorted(
        [p for p in glob.glob(os.path.join(POLICY_DIR, "**", "*"), recursive=True)
         if os.path.isfile(p)]
    )
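    # Chunk every readable file; binaries and other formats come back empty
    # from read_text_like and are skipped.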
    docs = []
    for fp in files:
        txt = read_text_like(fp)
        if not txt.strip():
            continue
        for ch in chunk(txt):
            docs.append({"text": ch, "source": os.path.relpath(fp)})

    if not docs:
        raise SystemExit(f"No indexable .txt/.md content found under '{POLICY_DIR}/'")

    model = SentenceTransformer(MODEL_NAME)
    texts = [d["text"] for d in docs]
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

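    # Embeddings are L2-normalized above, so inner product equals cosine
    # similarity; a flat IP index therefore performs exact cosine search.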
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))

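    # Persist the index alongside row-aligned metadata: meta["docs"][i]
    # describes the chunk stored at FAISS id i.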
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": docs}, f, ensure_ascii=False)

    n_sources = len({d["source"] for d in docs})
    print(f"Indexed {len(docs)} chunks from {n_sources} files → {INDEX_PATH}")

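# A minimal retrieval sketch, not part of the indexing script proper: load the
# persisted index and metadata, embed the query the same way the chunks were
# embedded, and return the top-k scored chunks. `search` and `k` are assumed,
# illustrative names, not an existing API.
def search(query: str, k: int = 5):
    index = faiss.read_index(INDEX_PATH)
    with open(META_PATH, encoding="utf-8") as f:
        meta = json.load(f)
    model = SentenceTransformer(meta["model"])
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q.astype(np.float32), k)
    return [
        {"score": float(s), **meta["docs"][int(i)]}
        for s, i in zip(scores[0], ids[0])
        if i != -1  # FAISS pads results with -1 when fewer than k vectors exist
    ]
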
if __name__ == "__main__":
    main()