# snote/scripts/process_incoming_docs.py
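"""Pipeline for processing incoming documents into searchable indexes.

Steps (see the functions below):
    1. ingest_manifest_step           -- ingest documents into raw_docs/
    2. convert_docs_to_markdown_step  -- convert .docx files to Markdown under converted/
    3. convert_md_to_chunks_step      -- split Markdown into chunks and write chunks_manifest.json
    4. build_bm25_index_step          -- rebuild the BM25 index (bm25_index.pkl)
    5. build_embedding_index_step     -- rebuild the Chroma embedding index (chroma_db/)
"""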
import datetime
import json
import logging
import pathlib
import shutil

from ingest_manifest import ingest_manifest
from chunks_and_metadata import convert_md_to_chunks
from document_parser import convert_doc_to_md
from bm25_index import load_chunks, build_bm25_index, save_index, load_index
from embedding_index import main as build_embedding_index_main

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
BASE = pathlib.Path(__file__).resolve().parent.parent
RAW = BASE / "raw_docs"
CONVERTED = BASE / "converted"
CHUNKS_DIR = BASE / "chunks"  # temp fix
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_OUT = BASE / "bm25_index.pkl"

# Chunking parameters (not referenced directly in this script).
MAX_TOKENS = 512
OVERLAP_TOKENS = 50
# Step 1: ingest incoming documents into raw_docs/
def ingest_manifest_step():
    ingest_manifest()
# Step 2: convert raw documents (.docx) to Markdown
def convert_docs_to_markdown_step():
    for doc in RAW.iterdir():
        if doc.suffix.lower() not in [".docx"]:
            logger.info("Skipping: %s", doc)
            continue
        out = CONVERTED / (doc.stem + ".md")
        convert_doc_to_md(doc, out)
        logger.info("Converted: %s", out)
# Step 3: split Markdown files into chunks and write the chunk manifest
def convert_md_to_chunks_step():
    manifests = []
    for md in CONVERTED.iterdir():
        m = convert_md_to_chunks(md, CHUNKS_DIR)
        manifests.extend(m)
    with open(CHUNKS_DIR / "chunks_manifest.json", "w", encoding="utf-8") as f:
        json.dump(
            {"generated_at": datetime.datetime.utcnow().isoformat() + "Z", "chunks": manifests},
            f,
            ensure_ascii=False,
            indent=2,
        )
    logger.info("Wrote %d chunks", len(manifests))
# Step 4: rebuild the BM25 index from the chunk files
def build_bm25_index_step():
    # Delete any existing BM25 index so it is rebuilt from scratch.
    if INDEX_OUT.exists():
        INDEX_OUT.unlink()
    chunks = load_chunks(CHUNKS_DIR)
    bm25_index = build_bm25_index(chunks)
    save_index(bm25_index, INDEX_OUT)
    logger.info("Built BM25 index and saved to %s", INDEX_OUT)
# Step 5: rebuild the Chroma embedding index
def build_embedding_index_step():
    # Delete any existing embedding index so it is rebuilt from scratch.
    chroma_dir = BASE / "chroma_db"
    if chroma_dir.exists():
        shutil.rmtree(chroma_dir)
    build_embedding_index_main(
        chunks_dir=str(CHUNKS_DIR),  # absolute path, so the step does not depend on the cwd
        persist_dir=str(chroma_dir),
        collection="snote",
        model_name="AITeamVN/Vietnamese_Embedding_v2",
        batch_size=100,
        device="cpu",
        force_reembed=True,
    )
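# Minimal end-to-end runner: chains the step functions above in pipeline order, assuming
# each step can run back to back against the same BASE directory.
def run_all():
    ingest_manifest_step()
    convert_docs_to_markdown_step()
    convert_md_to_chunks_step()
    build_bm25_index_step()
    build_embedding_index_step()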
if __name__ == "__main__":
    # Uncomment the steps you want to run.
    # ingest_manifest_step()
    # convert_docs_to_markdown_step()
    # convert_md_to_chunks_step()
    # build_bm25_index_step()
    build_embedding_index_step()