| | import os |
| | import shutil |
| | from huggingface_hub import hf_hub_download, list_repo_files |
| | from langchain_community.document_loaders import Docx2txtLoader |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| | from langchain_huggingface import HuggingFaceEmbeddings |
| | from langchain_chroma import Chroma |
| | from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN |
| |
|
def _reset_directories():
    """Wipe and recreate the knowledge-base and Chroma directories for a clean rebuild."""
    for path in (KB_DIR, CHROMA_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)


def _download_and_extract():
    """Download every .docx from the HF dataset repo and extract its text.

    Returns:
        list: LangChain ``Document`` objects, one or more per source file.
              Empty list if the repo contains no .docx files.
    """
    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
    all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    docx_files = [f for f in all_files if f.lower().endswith(".docx")]

    docs = []
    for file_name in docx_files:
        print(f"📄 Downloading {file_name}...")
        # hf_hub_download caches to the HF cache dir; copy into KB_DIR so the
        # knowledge base holds a self-contained snapshot of its sources.
        temp_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=file_name,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
        shutil.copy(temp_path, local_docx)

        loader = Docx2txtLoader(local_docx)
        docs.extend(loader.load())
        print(f"✅ Text extracted from: {file_name}")
    return docs


def _build_vector_store(docs):
    """Chunk the documents, embed the chunks, and persist a Chroma index to CHROMA_DIR."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,  # keep each chunk's offset in its source doc as metadata
    )
    splits = text_splitter.split_documents(docs)
    print(f"✂️ Split into {len(splits)} text chunks.")

    print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
    # langchain_chroma persists automatically when persist_directory is given.
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print("✨ Knowledge base fully initialized and saved.")


def run_ingestion():
    """Rebuild the knowledge base end-to-end.

    Wipes KB_DIR and CHROMA_DIR, downloads all .docx files from the
    HF_DATASET_REPO dataset, extracts their text, chunks and embeds it,
    and persists the result as a Chroma vector store.

    Side effects only (filesystem + network); returns None. Errors are
    reported to stdout rather than raised — this is a deliberate top-level
    boundary for a CLI script, so a failed ingestion does not traceback.
    """
    _reset_directories()

    try:
        docs = _download_and_extract()
        if not docs:
            # Covers both "no .docx files in repo" and "files yielded no text".
            print("❌ Error: Extracted document list is empty.")
            return

        _build_vector_store(docs)
    except Exception as e:
        # Broad catch is intentional at this boundary: network, HF auth,
        # file-system, and embedding-model failures all surface here.
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
| |
|
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    run_ingestion()