| |
| """ |
| Seed the ChromaDB 'company_policies' collection with sample policy documents. |
| Run once: python3 seed_vector_store.py |
| |
| Requires Ollama with nomic-embed-text: ollama pull nomic-embed-text |
| """ |
|
|
| from pathlib import Path |
|
|
| from dotenv import load_dotenv |
|
|
| load_dotenv(Path(__file__).parent / ".env") |
|
|
| from langchain_chroma import Chroma |
| from langchain_ollama import OllamaEmbeddings |
| from langchain_core.documents import Document |
|
|
# Name of the Chroma collection that the seed documents are written to.
COLLECTION_NAME = "company_policies"
# On-disk Chroma store: <directory of this file>/data/chroma_db.
PERSIST_DIR = Path(__file__).parent.joinpath("data", "chroma_db")
|
|
# One row per seed document: (source file name, document type, body text).
_POLICY_ROWS = (
    (
        "expense_policy.pdf",
        "policy",
        """Expense Policy: All business expenses must be pre-approved for amounts over $500.
Submit receipts within 30 days. Air travel must be economy class unless trip exceeds 8 hours.
Maximum daily meal allowance: $75 for domestic, $100 for international.""",
    ),
    (
        "remote_work.md",
        "policy",
        """Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval.
Core hours 10am-3pm local time are required. VPN must be used for all company systems.
Equipment reimbursement up to $500 for home office setup.""",
    ),
    (
        "leave_policy.pdf",
        "policy",
        """Leave Policy: Full-time employees receive 15 days PTO per year, 10 sick days.
Unused PTO carries over up to 5 days. bereavement leave: 5 days. Parental leave: 12 weeks paid.""",
    ),
    (
        "security_policy.pdf",
        "policy",
        """Data Security Policy: All customer data must be encrypted at rest and in transit.
Access to production databases requires 2FA and manager approval. No PII in logs or error messages.
Incident reporting within 24 hours.""",
    ),
    (
        "engineering_handbook.md",
        "documentation",
        """Code Review Process: All PRs require 2 approvals before merge. Run tests locally.
No direct commits to main. Use feature branches. Document breaking changes in CHANGELOG.""",
    ),
)

# Sample documents to seed, in the same order as the rows above; each carries
# its origin file and document type as metadata.
SAMPLE_POLICIES = [
    Document(page_content=text, metadata={"source": source, "type": kind})
    for source, kind, text in _POLICY_ROWS
]
|
|
|
|
def main() -> None:
    """Rebuild the Chroma collection named by COLLECTION_NAME from SAMPLE_POLICIES.

    Steps:
      1. Create PERSIST_DIR (and any missing parents) if it does not exist.
      2. Best-effort: open the existing collection and delete it.
      3. Create the collection from SAMPLE_POLICIES using Ollama
         ``nomic-embed-text`` embeddings, persisted under PERSIST_DIR.
      4. Print a one-line summary to stdout.

    A failure in step 2 does not abort the run, but it is reported on stderr
    instead of being silently discarded. Errors from step 3 propagate.
    """
    # Local import: only this function needs it, and keeping it here avoids
    # touching the file-level import block.
    import sys

    persist = PERSIST_DIR
    persist.mkdir(parents=True, exist_ok=True)
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    # Drop any previous copy of the collection before re-seeding.
    # NOTE(review): presumably this is what keeps repeated runs from stacking
    # duplicate documents - confirm against Chroma.from_documents semantics.
    try:
        existing = Chroma(
            collection_name=COLLECTION_NAME,
            embedding_function=embeddings,
            persist_directory=str(persist),
        )
        existing.delete_collection()
    except Exception as exc:
        # Still best-effort (seeding continues), but make the failure visible
        # so a stale or partially reset store is not mistaken for a clean one.
        print(
            f"Warning: could not reset collection '{COLLECTION_NAME}': {exc!r}",
            file=sys.stderr,
        )

    Chroma.from_documents(
        documents=SAMPLE_POLICIES,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=str(persist),
    )
    print(f"Seeded {len(SAMPLE_POLICIES)} documents into ChromaDB collection '{COLLECTION_NAME}'.")


# Run the seeding only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|