| | import os |
| | import shutil |
| | from huggingface_hub import hf_hub_download, list_repo_files |
| | from langchain_community.document_loaders import Docx2txtLoader |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| | from langchain_huggingface import HuggingFaceEmbeddings |
| | from langchain_chroma import Chroma |
| | from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN |
| |
|
def _reset_directories():
    """Wipe and recreate the knowledge-base and Chroma directories for a clean rebuild."""
    for path in (KB_DIR, CHROMA_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)


def _download_and_extract():
    """Download every .docx from the HF dataset repo and extract its text.

    Returns:
        list: LangChain ``Document`` objects, one or more per source file.
              Empty list if the repo contains no .docx files.
    """
    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
    all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    docx_files = [f for f in all_files if f.lower().endswith(".docx")]

    docs = []
    for file_name in docx_files:
        print(f"📄 Downloading {file_name}...")
        # hf_hub_download caches to the HF cache dir; copy into KB_DIR so the
        # knowledge base holds a self-contained snapshot of its sources.
        temp_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=file_name,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
        shutil.copy(temp_path, local_docx)

        loader = Docx2txtLoader(local_docx)
        docs.extend(loader.load())
        print(f"✅ Text extracted from: {file_name}")
    return docs


def _build_vector_store(docs):
    """Chunk the documents, embed the chunks, and persist a Chroma index to CHROMA_DIR."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,  # keep each chunk's offset in its source doc as metadata
    )
    splits = text_splitter.split_documents(docs)
    print(f"✂️ Split into {len(splits)} text chunks.")

    print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
    # langchain_chroma persists automatically when persist_directory is given.
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print("✨ Knowledge base fully initialized and saved.")


def run_ingestion():
    """Rebuild the knowledge base end-to-end.

    Wipes KB_DIR and CHROMA_DIR, downloads all .docx files from the
    HF_DATASET_REPO dataset, extracts their text, chunks and embeds it,
    and persists the result as a Chroma vector store.

    Side effects only (filesystem + network); returns None. Errors are
    reported to stdout rather than raised — this is a deliberate top-level
    boundary for a CLI script, so a failed ingestion does not traceback.
    """
    _reset_directories()

    try:
        docs = _download_and_extract()
        if not docs:
            # Covers both "no .docx files in repo" and "files yielded no text".
            print("❌ Error: Extracted document list is empty.")
            return

        _build_vector_store(docs)
    except Exception as e:
        # Broad catch is intentional at this boundary: network, HF auth,
        # file-system, and embedding-model failures all surface here.
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
| |
|
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    run_ingestion()