import os
from pathlib import Path

import chromadb
from dotenv import load_dotenv
from llama_index.core import (
    Document,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
|
|
|
| load_dotenv()
|
|
|
|
|
| cohere_api_key = os.getenv("COHERE_API_KEY")
|
| if not cohere_api_key:
|
| raise ValueError("COHERE_API_KEY not found in environment variables")
|
|
|
|
|
| embed_model = CohereEmbedding(
|
| cohere_api_key=cohere_api_key,
|
| model_name="embed-english-v3.0",
|
| input_type="search_document"
|
| )
|
|
|
|
|
| Settings.embed_model = embed_model
|
|
|
| def process_documents(department: str, base_dir: str = "./resources/data"):
|
| """
|
| Process and index documents for a specific department
|
|
|
| Args:
|
| department: The department name (e.g., 'hr', 'engineering')
|
| base_dir: Base directory containing department folders
|
| """
|
| print(f"Processing documents for {department} department...")
|
|
|
|
|
| dept_path = Path(base_dir) / department
|
| general_path = Path(base_dir) / "general"
|
| persist_dir = f"./chroma_db/{department}"
|
|
|
|
|
| os.makedirs(persist_dir, exist_ok=True)
|
|
|
|
|
| chroma_client = chromadb.PersistentClient(path=persist_dir)
|
|
|
|
|
| try:
|
| chroma_client.delete_collection("documents")
|
| except:
|
| pass
|
|
|
|
|
| chroma_collection = chroma_client.get_or_create_collection("documents")
|
|
|
|
|
| vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
| storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
|
|
|
| documents = []
|
|
|
|
|
| if dept_path.exists() and dept_path.is_dir():
|
| for file_path in dept_path.glob("*"):
|
| if file_path.is_file() and file_path.suffix in ['.md', '.txt', '.csv']:
|
| print(f"Processing {file_path.name}...")
|
| try:
|
|
|
| with open(file_path, 'r', encoding='utf-8') as f:
|
| content = f.read()
|
|
|
|
|
| from llama_index.core import Document
|
| doc = Document(
|
| text=content,
|
| metadata={
|
| "source": str(file_path.name),
|
| "department": department,
|
| "type": "department_specific"
|
| }
|
| )
|
| documents.append(doc)
|
| except Exception as e:
|
| print(f"Error processing {file_path}: {str(e)}")
|
|
|
|
|
| if general_path.exists() and general_path.is_dir():
|
| for file_path in general_path.glob("*"):
|
| if file_path.is_file() and file_path.suffix in ['.md', '.txt', '.csv']:
|
| print(f"Processing general document: {file_path.name}...")
|
| try:
|
|
|
| with open(file_path, 'r', encoding='utf-8') as f:
|
| content = f.read()
|
|
|
|
|
| from llama_index.core import Document
|
| doc = Document(
|
| text=content,
|
| metadata={
|
| "source": str(file_path.name),
|
| "department": "general",
|
| "type": "general"
|
| }
|
| )
|
| documents.append(doc)
|
| except Exception as e:
|
| print(f"Error processing general document {file_path}: {str(e)}")
|
|
|
| if not documents:
|
| print(f"No documents found for {department} department.")
|
| return
|
|
|
| print(f"Indexing {len(documents)} documents...")
|
|
|
|
|
| index = VectorStoreIndex.from_documents(
|
| documents,
|
| storage_context=storage_context,
|
| show_progress=True,
|
| embed_model=embed_model
|
| )
|
|
|
| print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
|
| print(f"Index stored in: {persist_dir}")
|
|
|
| def main():
|
| """Main function to process documents for all departments"""
|
| departments = ["hr", "engineering", "finance", "marketing"]
|
|
|
| for dept in departments:
|
| print(f"\n{'='*50}")
|
| print(f"Processing {dept.upper()} department")
|
| print(f"{'='*50}")
|
| process_documents(dept)
|
|
|
| print("\n✅ Document processing completed for all departments!")
|
|
|
| if __name__ == "__main__":
|
| main()
|
|
|