# snote/scripts/process_incoming_docs.py
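"""Pipeline for processing incoming documents into searchable indexes.

Steps (see the functions below):
    1. ingest_manifest_step           -- ingest documents into raw_docs/
    2. convert_docs_to_markdown_step  -- convert .docx files to Markdown under converted/
    3. convert_md_to_chunks_step      -- split Markdown into chunks and write chunks_manifest.json
    4. build_bm25_index_step          -- rebuild the BM25 index (bm25_index.pkl)
    5. build_embedding_index_step     -- rebuild the Chroma embedding index (chroma_db/)
"""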
import datetime
import json
import logging
import pathlib
import shutil

from ingest_manifest import ingest_manifest
from chunks_and_metadata import convert_md_to_chunks
from document_parser import convert_doc_to_md
from bm25_index import load_chunks, build_bm25_index, save_index, load_index
from embedding_index import main as build_embedding_index_main

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
BASE = pathlib.Path(__file__).resolve().parent.parent
RAW = BASE / "raw_docs"
CONVERTED = BASE / "converted"
CHUNKS_DIR = BASE / "chunks"  # temp fix
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_OUT = BASE / "bm25_index.pkl"

# Chunking parameters (not referenced directly in this script).
MAX_TOKENS = 512
OVERLAP_TOKENS = 50
# Step 1: ingest incoming documents into raw_docs/
def ingest_manifest_step():
    ingest_manifest()
# Step 2: convert raw documents (.docx) to Markdown
def convert_docs_to_markdown_step():
    for doc in RAW.iterdir():
        if doc.suffix.lower() not in [".docx"]:
            logger.info("Skipping: %s", doc)
            continue
        out = CONVERTED / (doc.stem + ".md")
        convert_doc_to_md(doc, out)
        logger.info("Converted: %s", out)
# Step 3: split Markdown files into chunks and write the chunk manifest
def convert_md_to_chunks_step():
    manifests = []
    for md in CONVERTED.iterdir():
        m = convert_md_to_chunks(md, CHUNKS_DIR)
        manifests.extend(m)
    with open(CHUNKS_DIR / "chunks_manifest.json", "w", encoding="utf-8") as f:
        json.dump(
            {"generated_at": datetime.datetime.utcnow().isoformat() + "Z", "chunks": manifests},
            f,
            ensure_ascii=False,
            indent=2,
        )
    logger.info("Wrote %d chunks", len(manifests))
# Step 4: rebuild the BM25 index from the chunk files
def build_bm25_index_step():
    # Delete any existing BM25 index so it is rebuilt from scratch.
    if INDEX_OUT.exists():
        INDEX_OUT.unlink()
    chunks = load_chunks(CHUNKS_DIR)
    bm25_index = build_bm25_index(chunks)
    save_index(bm25_index, INDEX_OUT)
    logger.info("Built BM25 index and saved to %s", INDEX_OUT)
# Step 5: rebuild the Chroma embedding index
def build_embedding_index_step():
    # Delete any existing embedding index so it is rebuilt from scratch.
    chroma_dir = BASE / "chroma_db"
    if chroma_dir.exists():
        shutil.rmtree(chroma_dir)
    build_embedding_index_main(
        chunks_dir=str(CHUNKS_DIR),  # absolute path, so the step does not depend on the cwd
        persist_dir=str(chroma_dir),
        collection="snote",
        model_name="AITeamVN/Vietnamese_Embedding_v2",
        batch_size=100,
        device="cpu",
        force_reembed=True,
    )
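# Minimal end-to-end runner: chains the step functions above in pipeline order, assuming
# each step can run back to back against the same BASE directory.
def run_all():
    ingest_manifest_step()
    convert_docs_to_markdown_step()
    convert_md_to_chunks_step()
    build_bm25_index_step()
    build_embedding_index_step()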
if __name__ == "__main__":
    # Uncomment the steps you want to run.
    # ingest_manifest_step()
    # convert_docs_to_markdown_step()
    # convert_md_to_chunks_step()
    # build_bm25_index_step()
    build_embedding_index_step()