abrar-adnan committed
Commit 6acfeaf · verified · 1 Parent(s): 008a215

Initial commit

.env ADDED
@@ -0,0 +1,12 @@
+ # Ollama Configuration (local)
+ OLLAMA_BASE_URL=http://localhost:11434
+ OLLAMA_MODEL_RESEARCH=llama3.2:3b
+ OLLAMA_MODEL_VERIFICATION=llama3.2:3b
+ OLLAMA_MODEL_RELEVANCE=llama3.2:3b
+
+ # Embedding Model (local)
+ EMBEDDING_MODEL=all-MiniLM-L6-v2
+
+ # Optional settings
+ LOG_LEVEL=INFO
+ CHROMA_DB_PATH=./chroma_db
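
The app reads these values through config/settings.py and talks to a local Ollama server, so the configured model must already be pulled. A minimal preflight sketch (not part of the commit), assuming the `ollama` Python client pinned in requirements-dev.txt:

```python
# Check that the server at OLLAMA_BASE_URL is up and llama3.2:3b is pulled.
import ollama

client = ollama.Client(host="http://localhost:11434")  # OLLAMA_BASE_URL from .env
available = [m.model for m in client.list().models]
if not any(name.startswith("llama3.2:3b") for name in available):
    raise SystemExit("Model missing - run: ollama pull llama3.2:3b")
print("Ollama reachable; models:", available)
```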
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ test/sample.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ # ChromaDB files
+ chroma_db/
+ document_cache/
+ chroma_db/chroma.sqlite3
+ app.log
+ examples/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10.18
LICENSE.md ADDED
@@ -0,0 +1,16 @@
+ ## Non-Commercial License
+
+ Copyright (c) [2025] [Hailey Thao Quach]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, merge, publish, and distribute the Software, subject to the following conditions:
+
+ 1. **Non-Commercial Use Only**
+ This software is licensed for non-commercial purposes only. Commercial use, including but not limited to, selling, licensing, incorporating into for-profit products or services, or otherwise using the software for financial gain, is strictly prohibited without prior written permission from the copyright holder.
+
+ 2. **Attribution**
+ Any use of this software must include proper attribution to the original author(s) by retaining this license text in all copies or substantial portions of the software.
+
+ 3. **No Warranty**
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ For permissions beyond the scope of this license, please contact [hailey@haileyq.com].
agents/__init.py__ ADDED
@@ -0,0 +1,5 @@
+ from .research_agent import ResearchAgent
+ from .verification_agent import VerificationAgent
+ from .workflow import AgentWorkflow
+
+ __all__ = ["ResearchAgent", "VerificationAgent", "AgentWorkflow"]
agents/__pycache__/relevance_checker.cpython-310.pyc ADDED
Binary file (3.41 kB)
agents/__pycache__/research_agent.cpython-310.pyc ADDED
Binary file (3.26 kB)
agents/__pycache__/verification_agent.cpython-310.pyc ADDED
Binary file (9.35 kB)
agents/__pycache__/workflow.cpython-310.pyc ADDED
Binary file (4.55 kB)
agents/relevance_checker.py ADDED
@@ -0,0 +1,88 @@
+ from langchain_ollama import ChatOllama
+ from config.settings import settings
+ import re
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class RelevanceChecker:
+     def __init__(self):
+         # Initialize the local Ollama LLM
+         print("Initializing RelevanceChecker with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_RELEVANCE,
+             temperature=0,
+             num_predict=10,
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def check(self, question: str, retriever, k=3) -> str:
+         """
+         1. Retrieve the top-k document chunks from the global retriever.
+         2. Combine them into a single text string.
+         3. Pass that text + question to the LLM for classification.
+
+         Returns: "CAN_ANSWER", "PARTIAL", or "NO_MATCH".
+         """
+         logger.debug(f"RelevanceChecker.check called with question='{question}' and k={k}")
+
+         # Retrieve doc chunks from the ensemble retriever
+         top_docs = retriever.invoke(question)
+         if not top_docs:
+             logger.debug("No documents returned from retriever.invoke(). Classifying as NO_MATCH.")
+             return "NO_MATCH"
+
+         # Combine the top k chunk texts into one string
+         document_content = "\n\n".join(doc.page_content for doc in top_docs[:k])
+
+         # Create a prompt for the LLM to classify relevance
+         prompt = f"""
+ You are an AI relevance checker between a user's question and provided document content.
+
+ **Instructions:**
+ - Classify how well the document content addresses the user's question.
+ - Respond with only one of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH.
+ - Do not include any additional text or explanation.
+
+ **Labels:**
+ 1) "CAN_ANSWER": The passages contain enough explicit information to fully answer the question.
+ 2) "PARTIAL": The passages mention or discuss the question's topic but do not provide all the details needed for a complete answer.
+ 3) "NO_MATCH": The passages do not discuss or mention the question's topic at all.
+
+ **Important:** If the passages mention or reference the topic or timeframe of the question in any way, even if incomplete, respond with "PARTIAL" instead of "NO_MATCH".
+
+ **Question:** {question}
+ **Passages:** {document_content}
+
+ **Respond ONLY with one of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH**
+ """
+
+         # Call the LLM
+         try:
+             response = self.llm.invoke(prompt)
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content.strip().upper()
+             else:
+                 llm_response = str(response).strip().upper()
+
+         except Exception as e:
+             logger.error(f"Error during model inference: {e}")
+             return "NO_MATCH"
+
+         logger.debug(f"LLM response: {llm_response}")
+
+         # Validate the response
+         valid_labels = {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}
+         if llm_response not in valid_labels:
+             logger.debug("LLM did not respond with a valid label. Forcing 'NO_MATCH'.")
+             classification = "NO_MATCH"
+         else:
+             logger.debug(f"Classification recognized as '{llm_response}'.")
+             classification = llm_response
+
+         print(f"Checker response: {classification}")
+         return classification
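
A minimal usage sketch for RelevanceChecker (not part of the commit). The stub stands in for the hybrid retriever that retriever/builder.py constructs at runtime; anything exposing `.invoke(question) -> list[Document]` works, since `check` only calls that method. A running Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.relevance_checker import RelevanceChecker

class StubRetriever:
    def invoke(self, question: str):
        # Hand-made chunk standing in for real retrieval results.
        return [Document(page_content="Q3 2023 revenue grew 12% year over year.")]

checker = RelevanceChecker()
label = checker.check("How much did revenue grow in Q3 2023?", StubRetriever(), k=3)
assert label in {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}
```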
agents/research_agent.py ADDED
@@ -0,0 +1,85 @@
+ from langchain_ollama import OllamaLLM
+ from langchain_ollama import ChatOllama
+ from typing import Dict, List
+ from langchain_core.documents.base import Document
+ from config.settings import settings
+
+ class ResearchAgent:
+     def __init__(self):
+         """
+         Initialize the research agent with local Ollama LLM.
+         """
+         print("Initializing ResearchAgent with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_RESEARCH,
+             temperature=0.3,
+             num_predict=300,  # max_tokens equivalent
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def sanitize_response(self, response_text: str) -> str:
+         """
+         Sanitize the LLM's response by stripping unnecessary whitespace.
+         """
+         return response_text.strip()
+
+     def generate_prompt(self, question: str, context: str) -> str:
+         """
+         Generate a structured prompt for the LLM to generate a precise and factual answer.
+         """
+         prompt = f"""
+ You are an AI assistant designed to provide precise and factual answers based on the given context.
+
+ **Instructions:**
+ - Answer the following question using only the provided context.
+ - Be clear, concise, and factual.
+ - Return as much information as you can get from the context.
+
+ **Question:** {question}
+ **Context:**
+ {context}
+
+ **Provide your answer below:**
+ """
+         return prompt
+
+     def generate(self, question: str, documents: List[Document]) -> Dict:
+         """
+         Generate an initial answer using the provided documents.
+         """
+         print(f"ResearchAgent.generate called with question='{question}' and {len(documents)} documents.")
+
+         # Combine the top document contents into one string
+         context = "\n\n".join([doc.page_content for doc in documents])
+         print(f"Combined context length: {len(context)} characters.")
+
+         # Create a prompt for the LLM
+         prompt = self.generate_prompt(question, context)
+         print("Prompt created for the LLM.")
+
+         # Call the LLM to generate the answer
+         try:
+             print("Sending prompt to Ollama...")
+             response = self.llm.invoke(prompt)
+             print("LLM response received.")
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content
+             else:
+                 llm_response = str(response)
+
+         except Exception as e:
+             print(f"Error during model inference: {e}")
+             raise RuntimeError("Failed to generate answer due to a model error.") from e
+
+         # Sanitize the response
+         draft_answer = self.sanitize_response(llm_response) if llm_response else "I cannot answer this question based on the provided documents."
+
+         print(f"Generated answer: {draft_answer}")
+
+         return {
+             "draft_answer": draft_answer,
+             "context_used": context
+         }
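
A usage sketch for ResearchAgent.generate (not part of the commit); the `Document` objects are fabricated stand-ins for retrieved chunks, and a local Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.research_agent import ResearchAgent

docs = [
    Document(page_content="DocChat indexes uploaded files into ChromaDB."),
    Document(page_content="Draft answers are verified against the source chunks."),
]
agent = ResearchAgent()
result = agent.generate("What happens to uploaded files?", docs)
print(result["draft_answer"])        # the sanitized draft answer
print(len(result["context_used"]))   # characters of combined context
```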
agents/verification_agent.py ADDED
@@ -0,0 +1,326 @@
+ from langchain_ollama import ChatOllama
+ from typing import Dict, List
+ from langchain_core.documents.base import Document
+ from config.settings import settings
+
+ class VerificationAgent:
+     def __init__(self):
+         """
+         Initialize the verification agent with local Ollama LLM.
+         """
+         print("Initializing VerificationAgent with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_VERIFICATION,
+             temperature=0.0,
+             num_predict=200,
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def sanitize_response(self, response_text: str) -> str:
+         """
+         Sanitize the LLM's response by stripping unnecessary whitespace.
+         """
+         return response_text.strip()
+
+     def generate_prompt(self, answer: str, context: str) -> str:
+         """
+         Generate a structured prompt for the LLM to verify the answer against the context.
+         """
+         prompt = f"""You are a strict verification agent. Your task is to verify if an answer is supported by the provided context.
+
+ CRITICAL RULES:
+ 1. ONLY use information from the context provided below. Do NOT use any external knowledge or assumptions.
+ 2. If a claim in the answer is NOT explicitly or implicitly supported by the context, mark it as unsupported.
+ 3. If the answer contradicts information in the context, mark it as a contradiction.
+ 4. If you cannot verify a claim using ONLY the context, mark it as unsupported.
+ 5. Be strict - do not assume or infer beyond what is clearly stated in the context.
+ 6. Respond EXACTLY in the format specified below - no additional text, explanations, or formatting.
+
+ **VERIFICATION FORMAT (follow exactly):**
+ Supported: YES
+ Unsupported Claims: []
+ Contradictions: []
+ Relevant: YES
+ Additional Details: None
+
+ OR if unsupported/contradictions found:
+ Supported: NO
+ Unsupported Claims: [list each unsupported claim exactly as it appears in the answer]
+ Contradictions: [list each contradiction exactly as it appears]
+ Relevant: YES or NO
+ Additional Details: [brief explanation of why claims are unsupported or contradicted]
+
+ **Answer to verify:**
+ {answer}
+
+ **Context (use ONLY this for verification):**
+ {context}
+
+ **Your verification (respond ONLY with the format above):**
+ """
+         return prompt
+
+     def parse_verification_response(self, response_text: str) -> Dict:
+         """
+         Parse the LLM's verification response into a structured dictionary.
+         """
+         try:
+             # Normalize the response - remove markdown formatting, extra whitespace
+             response_text = response_text.strip()
+             # Remove any markdown code blocks if present
+             if response_text.startswith('```'):
+                 lines = response_text.split('\n')
+                 response_text = '\n'.join(lines[1:-1]) if len(lines) > 2 else response_text
+
+             print(f"[DEBUG] Parsing verification response (first 500 chars): {response_text[:500]}")
+
+             verification = {}
+             lines = response_text.split('\n')
+
+             for line in lines:
+                 line = line.strip()
+                 if not line or ':' not in line:
+                     continue
+
+                 # Split on first colon only
+                 parts = line.split(':', 1)
+                 if len(parts) != 2:
+                     continue
+
+                 key = parts[0].strip()
+                 value = parts[1].strip()
+
+                 # Normalize key names (case-insensitive matching).
+                 # Note: 'unsupported' must be tested before 'supported' -- the
+                 # word "unsupported" contains "supported", so checking the
+                 # broader key first would swallow "Unsupported Claims" lines
+                 # and clobber the "Supported" field.
+                 key_lower = key.lower()
+                 if 'unsupported' in key_lower:
+                     # Handle list parsing
+                     items = []
+                     value = value.strip()
+                     if value.lower() in ['none', 'n/a', '[]', '']:
+                         items = []
+                     elif value.startswith('[') and value.endswith(']'):
+                         # Parse list items
+                         list_content = value[1:-1].strip()
+                         if list_content:
+                             items = [item.strip().strip('"').strip("'").strip()
+                                      for item in list_content.split(',')
+                                      if item.strip()]
+                     else:
+                         # Single item or comma-separated without brackets
+                         items = [item.strip().strip('"').strip("'")
+                                  for item in value.split(',')
+                                  if item.strip() and item.strip().lower() not in ['none', 'n/a']]
+                     verification["Unsupported Claims"] = items
+
+                 elif 'supported' in key_lower:
+                     # Extract YES/NO, handle variations
+                     value_upper = value.upper()
+                     print(f"[DEBUG] Found 'Supported' key with value: '{value}' (upper: '{value_upper}')")
+                     if 'YES' in value_upper or 'TRUE' in value_upper or 'Y' == value_upper.strip():
+                         verification["Supported"] = "YES"
+                         print("[DEBUG] Set Supported to YES")
+                     elif 'NO' in value_upper or 'FALSE' in value_upper or 'N' == value_upper.strip():
+                         verification["Supported"] = "NO"
+                         print("[DEBUG] Set Supported to NO")
+                     else:
+                         # If the value is empty or unclear, decide later based on
+                         # whether unsupported claims/contradictions were found
+                         print(f"[DEBUG] Supported value unclear: '{value}', will decide based on claims/contradictions")
+                         verification["Supported"] = None  # Mark as undecided
+
+                 elif 'contradiction' in key_lower:
+                     # Handle list parsing (same logic as unsupported)
+                     items = []
+                     value = value.strip()
+                     if value.lower() in ['none', 'n/a', '[]', '']:
+                         items = []
+                     elif value.startswith('[') and value.endswith(']'):
+                         list_content = value[1:-1].strip()
+                         if list_content:
+                             items = [item.strip().strip('"').strip("'").strip()
+                                      for item in list_content.split(',')
+                                      if item.strip()]
+                     else:
+                         items = [item.strip().strip('"').strip("'")
+                                  for item in value.split(',')
+                                  if item.strip() and item.strip().lower() not in ['none', 'n/a']]
+                     verification["Contradictions"] = items
+
+                 elif 'relevant' in key_lower:
+                     value_upper = value.upper()
+                     if 'YES' in value_upper or 'TRUE' in value_upper:
+                         verification["Relevant"] = "YES"
+                     elif 'NO' in value_upper or 'FALSE' in value_upper:
+                         verification["Relevant"] = "NO"
+                     else:
+                         verification["Relevant"] = "YES"  # Default to YES if unclear
+
+                 elif 'additional' in key_lower or 'detail' in key_lower:
+                     if value.lower() in ['none', 'n/a', '']:
+                         verification["Additional Details"] = ""
+                     else:
+                         verification["Additional Details"] = value
+
+             # Ensure all required keys are present with defaults
+             if "Supported" not in verification or verification.get("Supported") is None:
+                 # If undecided, check if there are unsupported claims or contradictions
+                 unsupported_claims = verification.get("Unsupported Claims", [])
+                 contradictions = verification.get("Contradictions", [])
+                 if not unsupported_claims and not contradictions:
+                     verification["Supported"] = "YES"  # No issues found, default to YES
+                     print("[DEBUG] Supported was missing/undecided and no claims/contradictions were found; defaulting to YES")
+                 else:
+                     verification["Supported"] = "NO"  # Issues found, default to NO
+                     print(f"[DEBUG] Supported was missing/undecided with {len(unsupported_claims)} unsupported claims and {len(contradictions)} contradictions; defaulting to NO")
+             if "Unsupported Claims" not in verification:
+                 verification["Unsupported Claims"] = []
+             if "Contradictions" not in verification:
+                 verification["Contradictions"] = []
+             if "Relevant" not in verification:
+                 verification["Relevant"] = "YES"
+             if "Additional Details" not in verification:
+                 verification["Additional Details"] = ""
+
+             print(f"[DEBUG] Final parsed verification: Supported={verification.get('Supported')}, Unsupported Claims={len(verification.get('Unsupported Claims', []))}, Contradictions={len(verification.get('Contradictions', []))}")
+             return verification
+         except Exception as e:
+             print(f"Error parsing verification response: {e}")
+             print(f"Response text was: {response_text}")
+             # Return a safe default
+             return {
+                 "Supported": "NO",
+                 "Unsupported Claims": [],
+                 "Contradictions": [],
+                 "Relevant": "NO",
+                 "Additional Details": f"Parsing error: {str(e)}"
+             }
+
+     def format_verification_report(self, verification: Dict) -> str:
+         """
+         Format the verification report dictionary into a readable markdown-formatted report.
+         """
+         supported = verification.get("Supported", "NO")
+         unsupported_claims = verification.get("Unsupported Claims", [])
+         contradictions = verification.get("Contradictions", [])
+         relevant = verification.get("Relevant", "NO")
+         additional_details = verification.get("Additional Details", "")
+
+         # Use markdown formatting for better display
+         report = "### Verification Report\n\n"
+
+         # Add status indicators
+         supported_icon = "✅" if supported == "YES" else "❌"
+         report += f"**Supported:** {supported_icon} {supported}\n\n"
+
+         if unsupported_claims:
+             report += "**⚠️ Unsupported Claims:**\n"
+             for claim in unsupported_claims:
+                 report += f"- {claim}\n"
+             report += "\n"
+         else:
+             report += "**Unsupported Claims:** None\n\n"
+
+         if contradictions:
+             report += "**🔴 Contradictions:**\n"
+             for contradiction in contradictions:
+                 report += f"- {contradiction}\n"
+             report += "\n"
+         else:
+             report += "**Contradictions:** None\n\n"
+
+         relevant_icon = "✅" if relevant == "YES" else "❌"
+         report += f"**Relevant:** {relevant_icon} {relevant}\n\n"
+
+         if additional_details and additional_details.lower() not in ['none', 'n/a', '']:
+             report += f"**Additional Details:**\n{additional_details}\n"
+         else:
+             report += "**Additional Details:** None\n"
+
+         return report
+
+     def generate_out_of_context_report(self) -> str:
+         """
+         Generate a verification report for questions that are out of context.
+         """
+         verification = {
+             "Supported": "NO",
+             "Unsupported Claims": ["The question is not related to the provided documents."],
+             "Contradictions": [],
+             "Relevant": "NO",
+             "Additional Details": "The question cannot be answered using the provided documents as it is out of context."
+         }
+         return self.format_verification_report(verification)
+
+     def check(self, answer: str, documents: List[Document]) -> Dict:
+         """
+         Verify the answer against the provided documents.
+         """
+         print(f"VerificationAgent.check called with answer='{answer}' and {len(documents)} documents.")
+
+         # Combine all document contents into one string
+         context_parts = [doc.page_content for doc in documents]
+         context = "\n\n".join(context_parts)
+
+         # Limit context size to prevent token overflow (keep the last
+         # MAX_CONTEXT_LENGTH characters; recent content is usually more relevant)
+         MAX_CONTEXT_LENGTH = 10000  # Approximate character limit
+         if len(context) > MAX_CONTEXT_LENGTH:
+             print(f"Context too long ({len(context)} chars), truncating to last {MAX_CONTEXT_LENGTH} chars")
+             context = context[-MAX_CONTEXT_LENGTH:]
+
+         print(f"Combined context length: {len(context)} characters.")
+
+         # Create a prompt for the LLM to verify the answer
+         prompt = self.generate_prompt(answer, context)
+         print("Prompt created for the LLM.")
+
+         # Call the LLM to generate the verification report
+         try:
+             print("Sending prompt to Ollama...")
+             response = self.llm.invoke(prompt)
+             print("LLM response received.")
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content
+             else:
+                 llm_response = str(response)
+
+         except Exception as e:
+             print(f"Error during model inference: {e}")
+             raise RuntimeError("Failed to verify answer due to a model error.") from e
+
+         # Sanitize the response
+         sanitized_response = self.sanitize_response(llm_response) if llm_response else ""
+         if not sanitized_response:
+             print("LLM returned an empty response.")
+             verification_report = {
+                 "Supported": "NO",
+                 "Unsupported Claims": [],
+                 "Contradictions": [],
+                 "Relevant": "NO",
+                 "Additional Details": "Empty response from the model."
+             }
+         else:
+             # Parse the response into the expected format
+             verification_report = self.parse_verification_response(sanitized_response)
+             if verification_report is None:
+                 print("LLM did not respond with the expected format. Using default verification report.")
+                 verification_report = {
+                     "Supported": "NO",
+                     "Unsupported Claims": [],
+                     "Contradictions": [],
+                     "Relevant": "NO",
+                     "Additional Details": "Failed to parse the model's response."
+                 }
+
+         # Format the verification report into a readable markdown block
+         verification_report_formatted = self.format_verification_report(verification_report)
+         print(f"Verification report:\n{verification_report_formatted}")
+         print(f"Context used: {context}")
+
+         return {
+             "verification_report": verification_report_formatted,
+             "context_used": context
+         }
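
`parse_verification_response` and `format_verification_report` operate on plain strings, so they can be exercised without calling the model. A sketch with a fabricated response (not part of the commit; constructing the agent builds the ChatOllama client but should not contact the server yet):

```python
from agents.verification_agent import VerificationAgent

agent = VerificationAgent()
sample = (
    "Supported: NO\n"
    "Unsupported Claims: [revenue grew 15%]\n"
    "Contradictions: []\n"
    "Relevant: YES\n"
    "Additional Details: The context states 12%, not 15%."
)
report = agent.parse_verification_response(sample)
assert report["Supported"] == "NO"
assert report["Unsupported Claims"] == ["revenue grew 15%"]
print(agent.format_verification_report(report))  # markdown report with ❌ markers
```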
agents/workflow.py ADDED
@@ -0,0 +1,136 @@
+ from langgraph.graph import StateGraph, END
+ from typing import TypedDict, List, Dict
+ from .research_agent import ResearchAgent
+ from .verification_agent import VerificationAgent
+ from .relevance_checker import RelevanceChecker
+ from langchain_core.documents.base import Document
+ # from langchain.retrievers import EnsembleRetriever
+ from langchain_classic.retrievers.ensemble import EnsembleRetriever
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class AgentState(TypedDict):
+     question: str
+     documents: List[Document]
+     draft_answer: str
+     verification_report: str
+     is_relevant: bool
+     retriever: EnsembleRetriever
+
+ class AgentWorkflow:
+     def __init__(self):
+         self.researcher = ResearchAgent()
+         self.verifier = VerificationAgent()
+         self.relevance_checker = RelevanceChecker()
+         self.compiled_workflow = self.build_workflow()  # Compile once during initialization
+
+     def build_workflow(self):
+         """Create and compile the multi-agent workflow."""
+         workflow = StateGraph(AgentState)
+
+         # Add nodes
+         workflow.add_node("check_relevance", self._check_relevance_step)
+         workflow.add_node("research", self._research_step)
+         workflow.add_node("verify", self._verification_step)
+
+         # Define edges
+         workflow.set_entry_point("check_relevance")
+         workflow.add_conditional_edges(
+             "check_relevance",
+             self._decide_after_relevance_check,
+             {
+                 "relevant": "research",
+                 "irrelevant": END
+             }
+         )
+         workflow.add_edge("research", "verify")
+         workflow.add_conditional_edges(
+             "verify",
+             self._decide_next_step,
+             {
+                 "re_research": "research",
+                 "end": END
+             }
+         )
+         return workflow.compile()
+
+     def _check_relevance_step(self, state: AgentState) -> Dict:
+         retriever = state["retriever"]
+         classification = self.relevance_checker.check(
+             question=state["question"],
+             retriever=retriever,
+             k=20
+         )
+
+         if classification == "CAN_ANSWER":
+             # We have enough info to proceed
+             return {"is_relevant": True}
+
+         elif classification == "PARTIAL":
+             # There's partial coverage, but we can still proceed
+             return {
+                 "is_relevant": True
+             }
+
+         else:  # classification == "NO_MATCH"
+             # Generate verification report for out-of-context questions
+             verification_report = self.verifier.generate_out_of_context_report()
+             return {
+                 "is_relevant": False,
+                 "draft_answer": "This question doesn't appear to be related to the uploaded document(s), or there's no matching data. Please ask another question relevant to the uploaded document(s).",
+                 "verification_report": verification_report
+             }
+
+     def _decide_after_relevance_check(self, state: AgentState) -> str:
+         decision = "relevant" if state["is_relevant"] else "irrelevant"
+         print(f"[DEBUG] _decide_after_relevance_check -> {decision}")
+         return decision
+
+     def full_pipeline(self, question: str, retriever: EnsembleRetriever):
+         try:
+             print(f"[DEBUG] Starting full_pipeline with question='{question}'")
+             documents = retriever.invoke(question)
+             logger.info(f"Retrieved {len(documents)} relevant documents (from .invoke)")
+             # print(documents)
+             initial_state = AgentState(
+                 question=question,
+                 documents=documents,
+                 draft_answer="",
+                 verification_report="",
+                 is_relevant=False,
+                 retriever=retriever
+             )
+
+             final_state = self.compiled_workflow.invoke(initial_state)
+
+             return {
+                 "draft_answer": final_state["draft_answer"],
+                 "verification_report": final_state["verification_report"]
+             }
+         except Exception as e:
+             logger.error(f"Workflow execution failed: {e}")
+             raise
+
+     def _research_step(self, state: AgentState) -> Dict:
+         print(f"[DEBUG] Entered _research_step with question='{state['question']}'")
+         result = self.researcher.generate(state["question"], state["documents"])
+         print("[DEBUG] Researcher returned draft answer.")
+         return {"draft_answer": result["draft_answer"]}
+
+     def _verification_step(self, state: AgentState) -> Dict:
+         print("[DEBUG] Entered _verification_step. Verifying the draft answer...")
+         result = self.verifier.check(state["draft_answer"], state["documents"])
+         print("[DEBUG] VerificationAgent returned a verification report.")
+         return {"verification_report": result["verification_report"]}
+
+     def _decide_next_step(self, state: AgentState) -> str:
+         verification_report = state["verification_report"]
+         print(f"[DEBUG] _decide_next_step with verification_report='{verification_report}'")
+         if "Supported: NO" in verification_report or "Relevant: NO" in verification_report:
+             logger.info("[DEBUG] Verification indicates re-research needed.")
+             return "re_research"
+         else:
+             logger.info("[DEBUG] Verification successful, ending workflow.")
+             return "end"
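
An end-to-end sketch for AgentWorkflow.full_pipeline (not part of the commit). The stub stands in for the EnsembleRetriever the app builds from uploads; duck typing suffices because the workflow only calls `retriever.invoke()`. A running Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.workflow import AgentWorkflow

class StubRetriever:
    def invoke(self, question: str):
        # Toy chunk; real runs use chunks from the uploaded documents.
        return [Document(page_content="The DocChat prototype shipped in 2025.")]

flow = AgentWorkflow()
result = flow.full_pipeline("When did the prototype ship?", StubRetriever())
print(result["draft_answer"])
print(result["verification_report"])
```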
app.py ADDED
@@ -0,0 +1,422 @@
+ import gradio as gr
+ import hashlib
+ from typing import List, Dict, Tuple
+ import os
+ import time
+
+ from document_processor.file_handler import DocumentProcessor
+ from retriever.builder import RetrieverBuilder
+ from agents.workflow import AgentWorkflow
+ from config import constants, settings
+ from utils.logging import logger
+
+
+ def main():
+     processor = DocumentProcessor()
+     retriever_builder = RetrieverBuilder()
+     workflow = AgentWorkflow()
+
+     # Define custom CSS for ChatGPT-like styling with dark sidebar
+     css = """
+     .sidebar {
+         background: #202123 !important;
+         border-right: 1px solid #343541 !important;
+         max-height: 90vh !important;
+         height: auto !important;
+         overflow-y: auto !important;
+         padding: 15px !important;
+         color: #ffffff !important;
+     }
+
+     .sidebar * {
+         color: #ffffff !important;
+     }
+
+     .sidebar label {
+         color: #d1d5db !important;
+     }
+
+     .sidebar input, .sidebar select, .sidebar textarea {
+         background: #343541 !important;
+         color: #ffffff !important;
+         border: 1px solid #565869 !important;
+     }
+
+     .main-container {
+         max-height: 90vh !important;
+         height: auto !important;
+         overflow-y: auto !important;
+     }
+
+     .chat-area {
+         height: 100vh !important;
+         display: flex !important;
+         flex-direction: column !important;
+         padding: 10px !important;
+     }
+
+     .chatbot-container {
+         flex: 1 1 auto !important;
+         min-height: 300px !important;
+         max-height: calc(100vh - 280px) !important;
+         overflow: hidden !important;
+     }
+
+     .chatbot-container .gradio-chatbot {
+         height: 100% !important;
+         max-height: calc(100vh - 280px) !important;
+     }
+
+     .input-area {
+         margin-top: 10px !important;
+     }
+
+     .processing-status {
+         padding: 8px !important;
+         border-radius: 6px !important;
+         margin: 8px 0 !important;
+         font-size: 0.9em !important;
+     }
+
+     .success {
+         background: #d4edda !important;
+         color: #155724 !important;
+         border: 1px solid #c3e6cb !important;
+     }
+
+     .error {
+         background: #f8d7da !important;
+         color: #721c24 !important;
+         border: 1px solid #f5c6cb !important;
+     }
+
+     .progress-info {
+         font-size: 0.85em !important;
+         color: #666 !important;
+         margin-top: 5px !important;
+     }
+     """
+
+     with gr.Blocks(theme=gr.themes.Soft(), title="DocChat", css=css) as demo:
+         # Session state for document processing
+         session_state = gr.State({
+             "file_hashes": frozenset(),
+             "retriever": None,
+             "processed_files": [],
+             "chat_history": []
+         })
+
+         # Main layout: Sidebar + Chat
+         with gr.Row(equal_height=True, elem_classes="main-container"):
+             # Left Sidebar for file management (narrower)
+             with gr.Column(scale=0.7, min_width=250, elem_classes="sidebar"):
+                 gr.Markdown("## 📁 Upload your documents here", elem_classes="title")
+
+                 # File upload component - files shown here
+                 files = gr.Files(
+                     label="Upload Documents",
+                     file_types=constants.ALLOWED_TYPES,
+                     height=150,
+                     show_label=True
+                 )
+
+                 # Sync button and status
+                 with gr.Row():
+                     sync_btn = gr.Button("🔄 Sync", variant="primary", scale=1, size="sm", elem_classes=["flex-item"])
+                 with gr.Row():
+                     sync_status_indicator = gr.HTML(
+                         '<div style="padding: 6px; text-align: center; border-radius: 4px; background: #343541; color: #9ca3af; font-size: 0.85em; width: 100%;">Not synced</div>',
+                         visible=True,
+                         elem_classes=["flex-item"]
+                     )
+                 # Provide equal layout using some CSS tweaks
+                 gr.HTML("""
+                 <style>
+                 .svelte-1ipelgc.flex-item { flex: 1 1 0 !important; min-width: 0 !important; }
+                 </style>
+                 """)
+
+                 # Processing status (only show when processing/processed)
+                 processing_status = gr.Markdown("", elem_classes="processing-status", visible=False)
+
+                 # Verification Report Section
+                 gr.HTML("""
+                 <style>
+                 .compact-markdown p,
+                 .compact-markdown h4,
+                 .compact-markdown h5,
+                 .compact-markdown h6 {
+                     margin-top: 0.25rem !important;
+                     margin-bottom: 0.25rem !important;
+                 }
+                 </style>
+                 """)
+
+                 # gr.Markdown("---")
+                 gr.Markdown('<span style="font-size: 1em;">📊 Verification Report</span>', elem_classes="compact-markdown")
+                 # gr.Markdown('<span style="font-size: 0.85em; color: #8e9297;"><em>Of the last message</em></span>', elem_classes="compact-markdown")
+                 verification_output = gr.Textbox(
+                     label="",
+                     interactive=False,
+                     lines=12,
+                     max_lines=12,
+                     value="",
+                     placeholder="""### Verification Report""",
+                     show_label=False
+                 )
+
+             # Right side: Chat interface
+             with gr.Column(scale=4, elem_classes="chat-area"):
+                 # Header section
+                 gr.Markdown("# 🤖 GeekBot")
+                 gr.Markdown("*Your personal AI*")
+                 gr.Markdown("*Upload your documents and start chatting about them. Supports pdf, docx, txt, and md files.*")
+
+                 # Chat interface - flex to fill available space
+                 with gr.Column(elem_classes="chatbot-container"):
+                     chatbot = gr.Chatbot(
+                         label="",
+                         show_label=False,
+                         show_copy_button=True,
+                         avatar_images=(None, "🤖"),
+                         container=True,
+                         height=550
+                     )
+
+                 # Input area
+                 with gr.Row(elem_classes="input-area"):
+                     msg = gr.Textbox(
+                         label="",
+                         placeholder="Type your question here...",
+                         show_label=False,
+                         scale=9,
+                         container=False
+                     )
+                     submit_btn = gr.Button("Send", scale=1, variant="primary")
+
+         # Function to remove files from ChromaDB when they're removed from UI
+         def handle_file_removal(current_files: List, state: Dict):
+             """Handle file removal - clean up ChromaDB and retriever if files are removed."""
+             if not current_files:
+                 # All files removed - reset retriever
+                 if state.get("retriever"):
+                     logger.info("All files removed. Resetting retriever.")
+                     state.update({
+                         "retriever": None,
+                         "file_hashes": frozenset(),
+                         "processed_files": []
+                     })
+                     return (
+                         get_sync_status_html("ready"),
+                         "",  # processing_status
+                         gr.update(visible=False),  # processing_status visibility
+                         state
+                     )
+                 return (
+                     get_sync_status_html("ready"),
+                     "",
+                     gr.update(visible=False),
+                     state
+                 )
+
+             # Check if any files were removed
+             current_hashes = _get_file_hashes(current_files)
+             if state.get("file_hashes") and current_hashes != state["file_hashes"]:
+                 # Files were removed - need to rebuild retriever with remaining files
+                 logger.info("Files were removed. Rebuilding retriever with remaining files...")
+                 try:
+                     chunks = processor.process(current_files)
+                     retriever = retriever_builder.build_hybrid_retriever(chunks)
+
+                     state.update({
+                         "file_hashes": current_hashes,
+                         "retriever": retriever,
+                         "processed_files": current_files
+                     })
+
+                     status_html = "✅ **Documents resynced**<br>"
+                     status_html += f"<div class='progress-info'>{len(chunks)} chunks indexed.</div>"
+
+                     return (
+                         get_sync_status_html("synced", len(chunks)),
+                         status_html,
+                         gr.update(visible=True),
+                         state
+                     )
+                 except Exception as e:
+                     logger.error(f"Error resyncing after file removal: {e}")
+                     return (
+                         get_sync_status_html("error"),
+                         f"❌ Error: {str(e)}",
+                         gr.update(visible=True),
+                         state
+                     )
+
+             return (
+                 get_sync_status_html("synced", len(state.get("processed_files", []))),
+                 "",
+                 gr.update(visible=False),
+                 state
+             )
+
+         # Function to update sync status indicator
+         def get_sync_status_html(status: str, count: int = 0) -> str:
+             """Generate HTML for the sync status indicator."""
+             if status == "synced":
+                 return f'<div style="padding: 8px; text-align: center; border-radius: 4px; background: #16a34a; color: #ffffff; font-weight: bold;">✅ Synced ({count} chunks)</div>'
+             elif status == "syncing":
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #f59e0b; color: #ffffff; font-weight: bold;">🔄 Syncing...</div>'
+             elif status == "error":
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #dc2626; color: #ffffff; font-weight: bold;">❌ Error</div>'
+             else:
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #343541; color: #9ca3af;">Not synced</div>'
+
+         # Function to process files (called by sync button)
+         def process_files(uploaded_files: List, state: Dict):
+             """Process files and build the retriever."""
+             if not uploaded_files:
+                 return (
+                     get_sync_status_html("ready"),  # sync_status
+                     "",  # processing_status
+                     gr.update(visible=False),  # processing_status visibility
+                     state
+                 )
+
+             try:
+                 current_hashes = _get_file_hashes(uploaded_files)
+
+                 # Check if files are new or changed
+                 if state["retriever"] is None or current_hashes != state["file_hashes"]:
+                     # Process documents
+                     logger.info("Processing new/changed documents...")
+                     chunks = processor.process(uploaded_files)
+                     logger.info("Building retriever...")
+                     retriever = retriever_builder.build_hybrid_retriever(chunks)
+                     logger.info("Retriever built successfully")
+
+                     state.update({
+                         "file_hashes": current_hashes,
+                         "retriever": retriever,
+                         "processed_files": uploaded_files
+                     })
+
+                     status_html = "✅ **Documents synced successfully!**<br>"
+                     status_html += f"<div class='progress-info'>{len(chunks)} chunks indexed. Ready for questions!</div>"
+
+                     return (
+                         get_sync_status_html("synced", len(chunks)),  # sync_status
+                         status_html,  # processing_status
+                         gr.update(visible=True),  # processing_status visibility
+                         state
+                     )
+                 else:
+                     # Files unchanged, already synced
+                     status_html = "✅ **Documents already synced**<br>"
+                     status_html += "<div class='progress-info'>Files are up to date. Ready for questions!</div>"
+
+                     # Get chunk count from state if available
+                     chunk_count = len(state.get("processed_files", []))
+
+                     return (
+                         get_sync_status_html("synced", chunk_count),  # sync_status
+                         status_html,  # processing_status
+                         gr.update(visible=True),  # processing_status visibility
+                         state
+                     )
+             except Exception as e:
+                 error_html = "❌ **Error syncing documents**<br>"
+                 error_html += f"<div class='progress-info'>{str(e)}</div>"
+                 logger.error(f"File processing error: {str(e)}")
+
+                 return (
+                     get_sync_status_html("error"),  # sync_status
+                     error_html,  # processing_status
+                     gr.update(visible=True),  # processing_status visibility
+                     state
+                 )
+
+         # Chat function for handling questions
+         def chat_function(message: str, history: List, state: Dict, verification_state: str):
+             """Handle chat messages and generate responses."""
+             try:
+                 if not message.strip():
+                     history.append((message, "Please enter a question."))
+                     return history, "", state, verification_state
+
+                 if state["retriever"] is None:
+                     history.append((message, "❌ No documents uploaded. Please upload documents first."))
+                     return history, "", state, verification_state
+
+                 # Get answer from workflow
+                 result = workflow.full_pipeline(
+                     question=message,
+                     retriever=state["retriever"]
+                 )
+
+                 answer = result["draft_answer"]
+                 verification_report = result["verification_report"]
+
+                 # Add to chat history
+                 history.append((message, answer))
+
+                 # Update state
+                 if "chat_history" not in state:
+                     state["chat_history"] = []
+                 state["chat_history"].append({"question": message, "answer": answer})
+
+                 return history, "", state, verification_report
+
+             except Exception as e:
+                 logger.error(f"Chat error: {str(e)}")
+                 error_msg = f"❌ Error: {str(e)}"
+                 history.append((message, error_msg))
+                 return history, "", state, ""
+
+         # Event handlers
+         # Handle file removal - check when files change
+         files.change(
+             fn=handle_file_removal,
+             inputs=[files, session_state],
+             outputs=[sync_status_indicator, processing_status, processing_status, session_state]
+         )
+
+         # Sync button to process files
+         sync_btn.click(
+             fn=process_files,
+             inputs=[files, session_state],
+             outputs=[sync_status_indicator, processing_status, processing_status, session_state],
+             show_progress=True
+         )
+
+         # Chat submission
+         msg.submit(
+             fn=chat_function,
+             inputs=[msg, chatbot, session_state, verification_output],
+             outputs=[chatbot, msg, session_state, verification_output]
+         )
+
+         submit_btn.click(
+             fn=chat_function,
+             inputs=[msg, chatbot, session_state, verification_output],
+             outputs=[chatbot, msg, session_state, verification_output]
+         )
+
+     demo.launch(server_name="127.0.0.1", server_port=5000, share=True)
+
+ def _get_file_hashes(uploaded_files: List) -> frozenset:
+     """Generate SHA-256 hashes for uploaded files."""
+     hashes = set()
+     for file in uploaded_files:
+         # Handle both Gradio file objects and string paths
+         file_path = file.name if hasattr(file, 'name') else file
+         try:
+             with open(file_path, "rb") as f:
+                 hashes.add(hashlib.sha256(f.read()).hexdigest())
+         except Exception as e:
+             logger.error(f"Error hashing file {file_path}: {e}")
+             continue
+     return frozenset(hashes)
+
+ if __name__ == "__main__":
+     main()
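
A sketch of the content-hash dedup behind syncing (not part of the commit): _get_file_hashes hashes file contents into a frozenset, so duplicate uploads collapse to one digest and "nothing changed" is a simple set-equality test. Note that importing `app` pulls in the full dependency stack:

```python
import tempfile
from app import _get_file_hashes

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("hello docchat")
    path = f.name

assert _get_file_hashes([path]) == _get_file_hashes([path, path])
assert len(_get_file_hashes([path])) == 1  # duplicates collapse to one hash
```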
config/__init.py__ ADDED
@@ -0,0 +1,4 @@
+ from .settings import settings
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+
+ __all__ = ["settings", "MAX_FILE_SIZE", "MAX_TOTAL_SIZE", "ALLOWED_TYPES"]
config/__pycache__/constants.cpython-310.pyc ADDED
Binary file (313 Bytes)
config/__pycache__/settings.cpython-310.pyc ADDED
Binary file (1.39 kB)
config/constants.py ADDED
@@ -0,0 +1,8 @@
+ # Maximum allowed size for a single file (50 MB)
+ MAX_FILE_SIZE: int = 50 * 1024 * 1024
+
+ # Maximum allowed total size for all uploaded files (200 MB)
+ MAX_TOTAL_SIZE: int = 200 * 1024 * 1024
+
+ # Allowed file types for upload
+ ALLOWED_TYPES: list = [".txt", ".pdf", ".docx", ".md"]
config/settings.py ADDED
@@ -0,0 +1,39 @@
+ from pydantic_settings import BaseSettings
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+ import os
+
+ class Settings(BaseSettings):
+     # Ollama settings (local)
+     OLLAMA_BASE_URL: str = "http://localhost:11434"
+     OLLAMA_MODEL_RESEARCH: str = "llama3.2:3b"
+     OLLAMA_MODEL_VERIFICATION: str = "llama3.2:3b"
+     OLLAMA_MODEL_RELEVANCE: str = "llama3.2:3b"
+
+     # Embedding model (local)
+     EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"  # sentence-transformers model
+
+     # Optional settings with defaults
+     MAX_FILE_SIZE: int = MAX_FILE_SIZE
+     MAX_TOTAL_SIZE: int = MAX_TOTAL_SIZE
+     ALLOWED_TYPES: list = ALLOWED_TYPES
+
+     # Database settings
+     CHROMA_DB_PATH: str = "./chroma_db"
+     CHROMA_COLLECTION_NAME: str = "documents"
+
+     # Retrieval settings
+     VECTOR_SEARCH_K: int = 10
+     HYBRID_RETRIEVER_WEIGHTS: list = [0.4, 0.6]
+
+     # Logging settings
+     LOG_LEVEL: str = "INFO"
+
+     # Cache settings
+     CACHE_DIR: str = "document_cache"
+     CACHE_EXPIRE_DAYS: int = 7
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+
+ settings = Settings()
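
With pydantic-settings, process environment variables take precedence over the `.env` file, which in turn overrides the class defaults above. A quick override sketch (not part of the commit; the model name is hypothetical):

```python
import os
os.environ["OLLAMA_MODEL_RESEARCH"] = "llama3.1:8b"  # hypothetical override

from config.settings import Settings
assert Settings().OLLAMA_MODEL_RESEARCH == "llama3.1:8b"
```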
document_processor/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .file_handler import DocumentProcessor
+
+ __all__ = ["DocumentProcessor"]
document_processor/__pycache__/file_handler.cpython-310.pyc ADDED
Binary file (4.17 kB)
document_processor/file_handler.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ import hashlib
+ import pickle
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import List
+ from docling.document_converter import DocumentConverter
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
+ from config import constants
+ from config.settings import settings
+ from utils.logging import logger
+
+ class DocumentProcessor:
+     def __init__(self):
+         self.headers = [("#", "Header 1"), ("##", "Header 2")]
+         self.cache_dir = Path(settings.CACHE_DIR)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+     def validate_files(self, files: List) -> None:
+         """Validate the total size of the uploaded files."""
+         total_size = 0
+         for f in files:
+             # Handle both Gradio file objects and string paths
+             file_path = f.name if hasattr(f, 'name') else f
+             try:
+                 total_size += os.path.getsize(file_path)
+             except Exception as e:
+                 logger.warning(f"Could not get size for {file_path}: {e}")
+                 continue
+         if total_size > constants.MAX_TOTAL_SIZE:
+             raise ValueError(f"Total size exceeds {constants.MAX_TOTAL_SIZE//1024//1024}MB limit")
+
+     def process(self, files: List) -> List:
+         """Process files with caching for subsequent queries."""
+         self.validate_files(files)
+         all_chunks = []
+         seen_hashes = set()
+
+         for file in files:
+             try:
+                 # Handle both Gradio file objects and string paths
+                 file_path = file.name if hasattr(file, 'name') else file
+
+                 # Generate content-based hash for caching
+                 with open(file_path, "rb") as f:
+                     file_hash = self._generate_hash(f.read())
+
+                 cache_path = self.cache_dir / f"{file_hash}.pkl"
+
+                 if self._is_cache_valid(cache_path):
+                     logger.info(f"Loading from cache: {file_path}")
+                     chunks = self._load_from_cache(cache_path)
+                 else:
+                     logger.info(f"Processing and caching: {file_path}")
+                     chunks = self._process_file(file_path)
+                     self._save_to_cache(chunks, cache_path)
+
+                 # Deduplicate chunks across files
+                 for chunk in chunks:
+                     chunk_hash = self._generate_hash(chunk.page_content.encode())
+                     if chunk_hash not in seen_hashes:
+                         all_chunks.append(chunk)
+                         seen_hashes.add(chunk_hash)
+
+             except Exception as e:
+                 file_path_display = file.name if hasattr(file, 'name') else file
+                 logger.error(f"Failed to process {file_path_display}: {str(e)}")
+                 continue
+
+         logger.info(f"Total unique chunks: {len(all_chunks)}")
+         return all_chunks
+
+     def _process_file(self, file) -> List:
+         """Original processing logic with Docling."""
+         # Handle both Gradio file objects and string paths
+         file_path = file.name if hasattr(file, 'name') else file
+
+         if not file_path.endswith(('.pdf', '.docx', '.txt', '.md')):
+             logger.warning(f"Skipping unsupported file type: {file_path}")
+             return []
+
+         converter = DocumentConverter()
+         markdown = converter.convert(file_path).document.export_to_markdown()
+         splitter = MarkdownHeaderTextSplitter(self.headers)
+         return splitter.split_text(markdown)
+
+     def _generate_hash(self, content: bytes) -> str:
+         return hashlib.sha256(content).hexdigest()
+
+     def _save_to_cache(self, chunks: List, cache_path: Path):
+         with open(cache_path, "wb") as f:
+             pickle.dump({
+                 "timestamp": datetime.now().timestamp(),
+                 "chunks": chunks
+             }, f)
+
+     def _load_from_cache(self, cache_path: Path) -> List:
+         with open(cache_path, "rb") as f:
+             data = pickle.load(f)
+         return data["chunks"]
+
+     def _is_cache_valid(self, cache_path: Path) -> bool:
+         if not cache_path.exists():
+             return False
+
+         cache_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
+         return cache_age < timedelta(days=settings.CACHE_EXPIRE_DAYS)
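
A caching sketch for DocumentProcessor (not part of the commit; the input path is hypothetical, and any supported .pdf/.docx/.txt/.md works). The second process() call on an unchanged file should log a cache hit and load pickled chunks from document_cache/ instead of re-running the Docling conversion:

```python
from document_processor.file_handler import DocumentProcessor

processor = DocumentProcessor()
first = processor.process(["./examples/notes.md"])   # parsed via Docling, then cached
second = processor.process(["./examples/notes.md"])  # "Loading from cache" path
assert len(first) == len(second)
```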
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [project]
+ name = "docchat-adnan"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "chromadb>=1.3.0",
+     "docling>=2.59.0",
+     "gradio>=5.49.1",
+     "langchain>=1.0.3",
+     "langchain-community>=0.4.1",
+     "langchain-ollama>=1.0.0",
+     "langchain-text-splitters>=1.0.0",
+     "langgraph>=1.0.2",
+     "loguru>=0.7.3",
+     "rank-bm25>=0.2.2",
+     "sentence-transformers>=5.1.2",
+ ]
requirements-dev.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.11.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.2
5
+ aiosignal==1.4.0
6
+ annotated-doc==0.0.3
7
+ annotated-types==0.7.0
8
+ antlr4-python3-runtime==4.9.3 ; python_full_version < '3.14'
9
+ anyio==4.11.0
10
+ async-timeout==4.0.3 ; python_full_version < '3.11'
11
+ attrs==25.4.0
12
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
13
+ backoff==2.2.1
14
+ bcrypt==5.0.0
15
+ beautifulsoup4==4.14.2
16
+ brotli==1.1.0
17
+ build==1.3.0
18
+ cachetools==6.2.1
19
+ certifi==2025.10.5
20
+ charset-normalizer==3.4.4
21
+ chromadb==1.3.0
22
+ click==8.3.0
23
+ colorama==0.4.6 ; (os_name != 'nt' and sys_platform == 'win32') or (os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux')
24
+ coloredlogs==15.0.1
25
+ colorlog==6.10.1 ; python_full_version < '3.14'
26
+ dataclasses-json==0.6.7
27
+ dill==0.4.0
28
+ distro==1.9.0
29
+ docling==2.59.0
30
+ docling-core==2.50.0
31
+ docling-ibm-models==3.10.2
32
+ docling-parse==4.7.0
33
+ durationpy==0.10
34
+ et-xmlfile==2.0.0
35
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
36
+ faker==37.12.0
37
+ fastapi==0.120.4
38
+ ffmpy==0.6.4
39
+ filelock==3.20.0
40
+ filetype==1.2.0
41
+ flatbuffers==25.9.23
42
+ frozenlist==1.8.0
43
+ fsspec==2025.10.0
44
+ google-auth==2.42.1
45
+ googleapis-common-protos==1.71.0
46
+ gradio==5.49.1
47
+ gradio-client==1.13.3
48
+ greenlet==3.2.4 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
+ groovy==0.1.2
+ grpcio==1.76.0
+ h11==0.16.0
+ hf-xet==1.2.0 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+ httpcore==1.0.9
+ httptools==0.7.1
+ httpx==0.28.1
+ httpx-sse==0.4.3
+ huggingface-hub==0.36.0
+ humanfriendly==10.0
+ idna==3.11
+ importlib-metadata==8.7.0
+ importlib-resources==6.5.2
+ jinja2==3.1.6
+ joblib==1.5.2
+ jsonlines==4.0.0
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonref==1.1.0
+ jsonschema==4.25.1
+ jsonschema-specifications==2025.9.1
+ kubernetes==34.1.0
+ langchain==1.0.3
+ langchain-classic==1.0.0
+ langchain-community==0.4.1
+ langchain-core==1.0.2
+ langchain-ollama==1.0.0
+ langchain-text-splitters==1.0.0
+ langgraph==1.0.2
+ langgraph-checkpoint==3.0.0
+ langgraph-prebuilt==1.0.2
+ langgraph-sdk==0.2.9
+ langsmith==0.4.38
+ latex2mathml==3.78.1
+ loguru==0.7.3
+ lxml==6.0.2
+ markdown-it-py==4.0.0
+ marko==2.2.1
+ markupsafe==3.0.3
+ marshmallow==3.26.1
+ mdurl==0.1.2
+ mmh3==5.2.0
+ mpire==2.10.2
+ mpmath==1.3.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ mypy-extensions==1.1.0
+ networkx==3.4.2 ; python_full_version < '3.11'
+ networkx==3.5 ; python_full_version >= '3.11'
+ numpy==2.2.6 ; python_full_version < '3.11'
+ numpy==2.3.4 ; python_full_version >= '3.11'
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ oauthlib==3.3.1
+ ocrmac==1.0.0 ; sys_platform == 'darwin'
+ ollama==0.6.0
+ omegaconf==2.3.0 ; python_full_version < '3.14'
+ onnxruntime==1.23.2
+ opencv-python==4.11.0.86 ; python_full_version < '3.14'
+ openpyxl==3.1.5
+ opentelemetry-api==1.38.0
+ opentelemetry-exporter-otlp-proto-common==1.38.0
+ opentelemetry-exporter-otlp-proto-grpc==1.38.0
+ opentelemetry-proto==1.38.0
+ opentelemetry-sdk==1.38.0
+ opentelemetry-semantic-conventions==0.59b0
+ orjson==3.11.4
+ ormsgpack==1.11.0
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.3.3
+ pillow==11.3.0
+ pluggy==1.6.0
+ polyfactory==2.22.3
+ posthog==5.4.0
+ propcache==0.4.1
+ protobuf==6.33.0
+ psutil==7.1.2
+ pyasn1==0.6.1
+ pyasn1-modules==0.4.2
+ pybase64==1.4.2
+ pyclipper==1.3.0.post6 ; python_full_version < '3.14'
+ pydantic==2.11.10
+ pydantic-core==2.33.2
+ pydantic-settings==2.11.0
+ pydub==0.25.1
+ pygments==2.19.2
+ pylatexenc==2.10
+ pyobjc-core==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-cocoa==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-coreml==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-quartz==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-vision==12.0 ; sys_platform == 'darwin'
+ pypdfium2==4.30.0
+ pypika==0.48.9
+ pyproject-hooks==1.2.0
+ pyreadline3==3.5.4 ; sys_platform == 'win32'
+ python-dateutil==2.9.0.post0
+ python-docx==1.2.0
+ python-dotenv==1.2.1
+ python-multipart==0.0.20
+ python-pptx==1.0.2
+ pytz==2025.2
+ pywin32==311 ; sys_platform == 'win32'
+ pyyaml==6.0.3
+ rank-bm25==0.2.2
+ rapidocr==3.4.2 ; python_full_version < '3.14'
+ referencing==0.37.0
+ regex==2025.10.23
+ requests==2.32.5
+ requests-oauthlib==2.0.0
+ requests-toolbelt==1.0.0
+ rich==14.2.0
+ rpds-py==0.28.0
+ rsa==4.9.1
+ rtree==1.4.1
+ ruff==0.14.3
+ safehttpx==0.1.7
+ safetensors==0.6.2
+ scikit-learn==1.7.2
+ scipy==1.15.3 ; python_full_version < '3.11'
+ scipy==1.16.3 ; python_full_version >= '3.11'
+ semantic-version==2.10.0
+ semchunk==2.2.2
+ sentence-transformers==5.1.2
+ setuptools==80.9.0 ; python_full_version >= '3.12'
+ shapely==2.1.2 ; python_full_version < '3.14'
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ soupsieve==2.8
+ sqlalchemy==2.0.44
+ starlette==0.49.2
+ sympy==1.14.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tokenizers==0.22.1
+ tomli==2.3.0 ; python_full_version < '3.11'
+ tomlkit==0.13.3
+ torch==2.9.0
+ torchvision==0.24.0
+ tqdm==4.67.1
+ transformers==4.57.1
+ triton==3.5.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ typer==0.19.2
+ typing-extensions==4.15.0
+ typing-inspect==0.9.0
+ typing-inspection==0.4.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ uvicorn==0.38.0
+ uvloop==0.22.1 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'
+ watchfiles==1.1.1
+ websocket-client==1.9.0
+ websockets==15.0.1
+ win32-setctime==1.2.0 ; sys_platform == 'win32'
+ xlsxwriter==3.2.9
+ xxhash==3.6.0
+ yarl==1.22.0
+ zipp==3.23.0
+ zstandard==0.25.0
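
Many of the pins above carry PEP 508 environment markers (everything after the `;`), so a package such as `pywin32` or the `nvidia-*` CUDA wheels is only installed when the target platform matches. As a minimal illustrative sketch (not part of this commit), the `packaging` library pinned above can evaluate such markers against the current interpreter; the marker strings are copied from the list, everything else here is hypothetical:

```python
# Sketch: evaluating the PEP 508 environment markers used in the pins above.
from packaging.markers import Marker

# Marker strings copied verbatim from the dependency list.
markers = [
    "sys_platform == 'win32'",                                   # pywin32, pyreadline3
    "platform_machine == 'x86_64' and sys_platform == 'linux'",  # nvidia-* CUDA wheels
    "python_full_version < '3.11'",                              # numpy==2.2.6, scipy==1.15.3
]

for m in markers:
    # Marker.evaluate() checks the marker against the *current* environment,
    # which is what pip/uv do when deciding whether to install a pin.
    print(f"{m!r} -> {Marker(m).evaluate()}")
```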
requirements.txt ADDED
@@ -0,0 +1,773 @@
+ # This file was autogenerated by uv via the following command:
+ # uv export --no-hashes --format requirements-txt
+ accelerate==1.11.0
+ # via
+ # docling
+ # docling-ibm-models
+ aiofiles==24.1.0
+ # via gradio
+ aiohappyeyeballs==2.6.1
+ # via aiohttp
+ aiohttp==3.13.2
+ # via langchain-community
+ aiosignal==1.4.0
+ # via aiohttp
+ annotated-doc==0.0.3
+ # via fastapi
+ annotated-types==0.7.0
+ # via pydantic
+ antlr4-python3-runtime==4.9.3 ; python_full_version < '3.14'
+ # via omegaconf
+ anyio==4.11.0
+ # via
+ # gradio
+ # httpx
+ # starlette
+ # watchfiles
+ async-timeout==4.0.3 ; python_full_version < '3.11'
+ # via
+ # aiohttp
+ # langchain-classic
+ attrs==25.4.0
+ # via
+ # aiohttp
+ # jsonlines
+ # jsonschema
+ # referencing
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
+ # via gradio
+ backoff==2.2.1
+ # via posthog
+ bcrypt==5.0.0
+ # via chromadb
+ beautifulsoup4==4.14.2
+ # via docling
+ brotli==1.1.0
+ # via gradio
+ build==1.3.0
+ # via chromadb
+ cachetools==6.2.1
+ # via google-auth
+ certifi==2025.10.5
+ # via
+ # docling
+ # httpcore
+ # httpx
+ # kubernetes
+ # requests
+ charset-normalizer==3.4.4
+ # via requests
+ chromadb==1.3.0
+ # via docchat-adnan
+ click==8.3.0
+ # via
+ # ocrmac
+ # typer
+ # uvicorn
+ colorama==0.4.6 ; (os_name != 'nt' and sys_platform == 'win32') or (os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux')
+ # via
+ # build
+ # click
+ # colorlog
+ # loguru
+ # tqdm
+ # uvicorn
+ coloredlogs==15.0.1
+ # via onnxruntime
+ colorlog==6.10.1 ; python_full_version < '3.14'
+ # via rapidocr
+ dataclasses-json==0.6.7
+ # via langchain-community
+ dill==0.4.0
+ # via multiprocess
+ distro==1.9.0
+ # via posthog
+ docling==2.59.0
+ # via docchat-adnan
+ docling-core==2.50.0
+ # via
+ # docling
+ # docling-ibm-models
+ # docling-parse
+ docling-ibm-models==3.10.2
+ # via docling
+ docling-parse==4.7.0
+ # via docling
+ durationpy==0.10
+ # via kubernetes
+ et-xmlfile==2.0.0
+ # via openpyxl
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
+ # via anyio
+ faker==37.12.0
+ # via polyfactory
+ fastapi==0.120.4
+ # via gradio
+ ffmpy==0.6.4
+ # via gradio
+ filelock==3.20.0
+ # via
+ # huggingface-hub
+ # torch
+ # transformers
+ filetype==1.2.0
+ # via docling
+ flatbuffers==25.9.23
+ # via onnxruntime
+ frozenlist==1.8.0
+ # via
+ # aiohttp
+ # aiosignal
+ fsspec==2025.10.0
+ # via
+ # gradio-client
+ # huggingface-hub
+ # torch
+ google-auth==2.42.1
+ # via kubernetes
+ googleapis-common-protos==1.71.0
+ # via opentelemetry-exporter-otlp-proto-grpc
+ gradio==5.49.1
+ # via docchat-adnan
+ gradio-client==1.13.3
+ # via gradio
+ greenlet==3.2.4 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
+ # via sqlalchemy
+ groovy==0.1.2
+ # via gradio
+ grpcio==1.76.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ h11==0.16.0
+ # via
+ # httpcore
+ # uvicorn
+ hf-xet==1.2.0 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+ # via huggingface-hub
+ httpcore==1.0.9
+ # via httpx
+ httptools==0.7.1
+ # via uvicorn
+ httpx==0.28.1
+ # via
+ # chromadb
+ # gradio
+ # gradio-client
+ # langgraph-sdk
+ # langsmith
+ # ollama
+ # safehttpx
+ httpx-sse==0.4.3
+ # via langchain-community
+ huggingface-hub==0.36.0
+ # via
+ # accelerate
+ # docling
+ # docling-ibm-models
+ # gradio
+ # gradio-client
+ # sentence-transformers
+ # tokenizers
+ # transformers
+ humanfriendly==10.0
+ # via coloredlogs
+ idna==3.11
+ # via
+ # anyio
+ # httpx
+ # requests
+ # yarl
+ importlib-metadata==8.7.0
+ # via
+ # build
+ # opentelemetry-api
+ importlib-resources==6.5.2
+ # via chromadb
+ jinja2==3.1.6
+ # via
+ # gradio
+ # torch
+ joblib==1.5.2
+ # via scikit-learn
+ jsonlines==4.0.0
+ # via docling-ibm-models
+ jsonpatch==1.33
+ # via langchain-core
+ jsonpointer==3.0.0
+ # via jsonpatch
+ jsonref==1.1.0
+ # via docling-core
+ jsonschema==4.25.1
+ # via
+ # chromadb
+ # docling-core
+ jsonschema-specifications==2025.9.1
+ # via jsonschema
+ kubernetes==34.1.0
+ # via chromadb
+ langchain==1.0.3
+ # via docchat-adnan
+ langchain-classic==1.0.0
+ # via langchain-community
+ langchain-community==0.4.1
+ # via docchat-adnan
+ langchain-core==1.0.2
+ # via
+ # langchain
+ # langchain-classic
+ # langchain-community
+ # langchain-ollama
+ # langchain-text-splitters
+ # langgraph
+ # langgraph-checkpoint
+ # langgraph-prebuilt
+ langchain-ollama==1.0.0
+ # via docchat-adnan
+ langchain-text-splitters==1.0.0
+ # via
+ # docchat-adnan
+ # langchain-classic
+ langgraph==1.0.2
+ # via
+ # docchat-adnan
+ # langchain
+ langgraph-checkpoint==3.0.0
+ # via
+ # langgraph
+ # langgraph-prebuilt
+ langgraph-prebuilt==1.0.2
+ # via langgraph
+ langgraph-sdk==0.2.9
+ # via langgraph
+ langsmith==0.4.38
+ # via
+ # langchain-classic
+ # langchain-community
+ # langchain-core
+ latex2mathml==3.78.1
+ # via docling-core
+ loguru==0.7.3
+ # via docchat-adnan
+ lxml==6.0.2
+ # via
+ # docling
+ # python-docx
+ # python-pptx
+ markdown-it-py==4.0.0
+ # via rich
+ marko==2.2.1
+ # via docling
+ markupsafe==3.0.3
+ # via
+ # gradio
+ # jinja2
+ marshmallow==3.26.1
+ # via dataclasses-json
+ mdurl==0.1.2
+ # via markdown-it-py
+ mmh3==5.2.0
+ # via chromadb
+ mpire==2.10.2
+ # via semchunk
+ mpmath==1.3.0
+ # via sympy
+ multidict==6.7.0
+ # via
+ # aiohttp
+ # yarl
+ multiprocess==0.70.18
+ # via mpire
+ mypy-extensions==1.1.0
+ # via typing-inspect
+ networkx==3.4.2 ; python_full_version < '3.11'
+ # via torch
+ networkx==3.5 ; python_full_version >= '3.11'
+ # via torch
+ numpy==2.2.6 ; python_full_version < '3.11'
+ # via
+ # accelerate
+ # chromadb
+ # docling-ibm-models
+ # gradio
+ # langchain-community
+ # onnxruntime
+ # opencv-python
+ # pandas
+ # rank-bm25
+ # rapidocr
+ # safetensors
+ # scikit-learn
+ # scipy
+ # shapely
+ # torchvision
+ # transformers
+ numpy==2.3.4 ; python_full_version >= '3.11'
+ # via
+ # accelerate
+ # chromadb
+ # docling-ibm-models
+ # gradio
+ # langchain-community
+ # onnxruntime
+ # opencv-python
+ # pandas
+ # rank-bm25
+ # rapidocr
+ # safetensors
+ # scikit-learn
+ # scipy
+ # shapely
+ # torchvision
+ # transformers
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cudnn-cu12
+ # nvidia-cusolver-cu12
+ # torch
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cusolver-cu12
+ # torch
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cufft-cu12
+ # nvidia-cusolver-cu12
+ # nvidia-cusparse-cu12
+ # torch
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ oauthlib==3.3.1
+ # via requests-oauthlib
+ ocrmac==1.0.0 ; sys_platform == 'darwin'
+ # via docling
+ ollama==0.6.0
+ # via langchain-ollama
+ omegaconf==2.3.0 ; python_full_version < '3.14'
+ # via rapidocr
+ onnxruntime==1.23.2
+ # via chromadb
+ opencv-python==4.11.0.86 ; python_full_version < '3.14'
+ # via rapidocr
+ openpyxl==3.1.5
+ # via docling
+ opentelemetry-api==1.38.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ # opentelemetry-sdk
+ # opentelemetry-semantic-conventions
+ opentelemetry-exporter-otlp-proto-common==1.38.0
+ # via opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-exporter-otlp-proto-grpc==1.38.0
+ # via chromadb
+ opentelemetry-proto==1.38.0
+ # via
+ # opentelemetry-exporter-otlp-proto-common
+ # opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-sdk==1.38.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-semantic-conventions==0.59b0
+ # via opentelemetry-sdk
+ orjson==3.11.4
+ # via
+ # chromadb
+ # gradio
+ # langgraph-sdk
+ # langsmith
+ ormsgpack==1.11.0
+ # via langgraph-checkpoint
+ overrides==7.7.0
+ # via chromadb
+ packaging==25.0
+ # via
+ # accelerate
+ # build
+ # gradio
+ # gradio-client
+ # huggingface-hub
+ # langchain-core
+ # langsmith
+ # marshmallow
+ # onnxruntime
+ # transformers
+ pandas==2.3.3
+ # via
+ # docling
+ # docling-core
+ # gradio
+ pillow==11.3.0
+ # via
+ # docling
+ # docling-core
+ # docling-ibm-models
+ # docling-parse
+ # gradio
+ # ocrmac
+ # python-pptx
+ # rapidocr
+ # sentence-transformers
+ # torchvision
+ pluggy==1.6.0
+ # via docling
+ polyfactory==2.22.3
+ # via docling
+ posthog==5.4.0
+ # via chromadb
+ propcache==0.4.1
+ # via
+ # aiohttp
+ # yarl
+ protobuf==6.33.0
+ # via
+ # googleapis-common-protos
+ # onnxruntime
+ # opentelemetry-proto
+ psutil==7.1.2
+ # via accelerate
+ pyasn1==0.6.1
+ # via
+ # pyasn1-modules
+ # rsa
+ pyasn1-modules==0.4.2
+ # via google-auth
+ pybase64==1.4.2
+ # via chromadb
+ pyclipper==1.3.0.post6 ; python_full_version < '3.14'
+ # via rapidocr
+ pydantic==2.11.10
+ # via
+ # chromadb
+ # docling
+ # docling-core
+ # docling-ibm-models
+ # docling-parse
+ # fastapi
+ # gradio
+ # langchain
+ # langchain-classic
+ # langchain-core
+ # langgraph
+ # langsmith
+ # ollama
+ # pydantic-settings
+ pydantic-core==2.33.2
+ # via pydantic
+ pydantic-settings==2.11.0
+ # via
+ # docling
+ # langchain-community
+ pydub==0.25.1
+ # via gradio
+ pygments==2.19.2
+ # via
+ # mpire
+ # rich
+ pylatexenc==2.10
+ # via docling
+ pyobjc-core==12.0 ; sys_platform == 'darwin'
+ # via
+ # pyobjc-framework-cocoa
+ # pyobjc-framework-coreml
+ # pyobjc-framework-quartz
+ # pyobjc-framework-vision
+ pyobjc-framework-cocoa==12.0 ; sys_platform == 'darwin'
+ # via
+ # pyobjc-framework-coreml
+ # pyobjc-framework-quartz
+ # pyobjc-framework-vision
+ pyobjc-framework-coreml==12.0 ; sys_platform == 'darwin'
+ # via pyobjc-framework-vision
+ pyobjc-framework-quartz==12.0 ; sys_platform == 'darwin'
+ # via pyobjc-framework-vision
+ pyobjc-framework-vision==12.0 ; sys_platform == 'darwin'
+ # via ocrmac
+ pypdfium2==4.30.0
+ # via docling
+ pypika==0.48.9
+ # via chromadb
+ pyproject-hooks==1.2.0
+ # via build
+ pyreadline3==3.5.4 ; sys_platform == 'win32'
+ # via humanfriendly
+ python-dateutil==2.9.0.post0
+ # via
+ # kubernetes
+ # pandas
+ # posthog
+ python-docx==1.2.0
+ # via docling
+ python-dotenv==1.2.1
+ # via
+ # pydantic-settings
+ # uvicorn
+ python-multipart==0.0.20
+ # via gradio
+ python-pptx==1.0.2
+ # via docling
+ pytz==2025.2
+ # via pandas
+ pywin32==311 ; sys_platform == 'win32'
+ # via
+ # docling-parse
+ # mpire
+ pyyaml==6.0.3
+ # via
+ # accelerate
+ # chromadb
+ # docling-core
+ # gradio
+ # huggingface-hub
+ # kubernetes
+ # langchain-classic
+ # langchain-community
+ # langchain-core
+ # omegaconf
+ # rapidocr
+ # transformers
+ # uvicorn
+ rank-bm25==0.2.2
+ # via docchat-adnan
+ rapidocr==3.4.2 ; python_full_version < '3.14'
+ # via docling
+ referencing==0.37.0
+ # via
+ # jsonschema
+ # jsonschema-specifications
+ regex==2025.10.23
+ # via transformers
+ requests==2.32.5
+ # via
+ # docling
+ # huggingface-hub
+ # kubernetes
+ # langchain-classic
+ # langchain-community
+ # langsmith
+ # posthog
+ # rapidocr
+ # requests-oauthlib
+ # requests-toolbelt
+ # transformers
+ requests-oauthlib==2.0.0
+ # via kubernetes
+ requests-toolbelt==1.0.0
+ # via langsmith
+ rich==14.2.0
+ # via
+ # chromadb
+ # typer
+ rpds-py==0.28.0
+ # via
+ # jsonschema
+ # referencing
+ rsa==4.9.1
+ # via google-auth
+ rtree==1.4.1
+ # via
+ # docling
+ # docling-ibm-models
+ ruff==0.14.3
+ # via gradio
+ safehttpx==0.1.7
+ # via gradio
+ safetensors==0.6.2
+ # via
+ # accelerate
+ # docling-ibm-models
+ # transformers
+ scikit-learn==1.7.2
+ # via sentence-transformers
+ scipy==1.15.3 ; python_full_version < '3.11'
+ # via
+ # docling
+ # scikit-learn
+ # sentence-transformers
+ scipy==1.16.3 ; python_full_version >= '3.11'
+ # via
+ # docling
+ # scikit-learn
+ # sentence-transformers
+ semantic-version==2.10.0
+ # via gradio
+ semchunk==2.2.2
+ # via docling-core
+ sentence-transformers==5.1.2
+ # via docchat-adnan
+ setuptools==80.9.0 ; python_full_version >= '3.12'
+ # via torch
+ shapely==2.1.2 ; python_full_version < '3.14'
+ # via rapidocr
+ shellingham==1.5.4
+ # via typer
+ six==1.17.0
+ # via
+ # kubernetes
+ # posthog
+ # python-dateutil
+ # rapidocr
+ sniffio==1.3.1
+ # via anyio
+ soupsieve==2.8
+ # via beautifulsoup4
+ sqlalchemy==2.0.44
+ # via
+ # langchain-classic
+ # langchain-community
+ starlette==0.49.2
+ # via
+ # fastapi
+ # gradio
+ sympy==1.14.0
+ # via
+ # onnxruntime
+ # torch
+ tabulate==0.9.0
+ # via
+ # docling-core
+ # docling-parse
+ tenacity==9.1.2
+ # via
+ # chromadb
+ # langchain-community
+ # langchain-core
+ threadpoolctl==3.6.0
+ # via scikit-learn
+ tokenizers==0.22.1
+ # via
+ # chromadb
+ # transformers
+ tomli==2.3.0 ; python_full_version < '3.11'
+ # via build
+ tomlkit==0.13.3
+ # via gradio
+ torch==2.9.0
+ # via
+ # accelerate
+ # docling-ibm-models
+ # safetensors
+ # sentence-transformers
+ # torchvision
+ torchvision==0.24.0
+ # via docling-ibm-models
+ tqdm==4.67.1
+ # via
+ # chromadb
+ # docling
+ # docling-ibm-models
+ # huggingface-hub
+ # mpire
+ # rapidocr
+ # semchunk
+ # sentence-transformers
+ # transformers
+ transformers==4.57.1
+ # via
+ # docling-core
+ # docling-ibm-models
+ # sentence-transformers
+ triton==3.5.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ typer==0.19.2
+ # via
+ # chromadb
+ # docling
+ # docling-core
+ # gradio
+ typing-extensions==4.15.0
+ # via
+ # aiosignal
+ # anyio
+ # beautifulsoup4
+ # chromadb
+ # docling-core
+ # exceptiongroup
+ # fastapi
+ # gradio
+ # gradio-client
+ # grpcio
+ # huggingface-hub
+ # langchain-core
+ # multidict
+ # opentelemetry-api
+ # opentelemetry-exporter-otlp-proto-grpc
+ # opentelemetry-sdk
+ # opentelemetry-semantic-conventions
+ # polyfactory
+ # pydantic
+ # pydantic-core
+ # python-docx
+ # python-pptx
+ # referencing
+ # sentence-transformers
+ # sqlalchemy
+ # starlette
+ # torch
+ # typer
+ # typing-inspect
+ # typing-inspection
+ # uvicorn
+ typing-inspect==0.9.0
+ # via dataclasses-json
+ typing-inspection==0.4.2
+ # via
+ # pydantic
+ # pydantic-settings
+ tzdata==2025.2
+ # via
+ # faker
+ # pandas
+ urllib3==2.3.0
+ # via
+ # kubernetes
+ # requests
+ uvicorn==0.38.0
+ # via
+ # chromadb
+ # gradio
+ uvloop==0.22.1 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'
+ # via uvicorn
+ watchfiles==1.1.1
+ # via uvicorn
+ websocket-client==1.9.0
+ # via kubernetes
+ websockets==15.0.1
+ # via
+ # gradio-client
+ # uvicorn
+ win32-setctime==1.2.0 ; sys_platform == 'win32'
+ # via loguru
+ xlsxwriter==3.2.9
+ # via python-pptx
+ xxhash==3.6.0
+ # via langgraph
+ yarl==1.22.0
+ # via aiohttp
+ zipp==3.23.0
+ # via importlib-metadata
+ zstandard==0.25.0
+ # via langsmith
retriever/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .builder import RetrieverBuilder
+
+ __all__ = ["RetrieverBuilder"]
retriever/__pycache__/builder.cpython-310.pyc ADDED
Binary file (3.26 kB). View file
 
retriever/builder.py ADDED
@@ -0,0 +1,105 @@
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.retrievers import BM25Retriever
+ # from langchain.retrievers import EnsembleRetriever
+ from langchain_classic.retrievers.ensemble import EnsembleRetriever
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from config.settings import settings
+ import logging
+ import os
+ import hashlib
+
+ logger = logging.getLogger(__name__)
+
+ class RetrieverBuilder:
+     def __init__(self):
+         """Initialize the retriever builder with local embeddings."""
+         # Use sentence-transformers for local embeddings
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name=settings.EMBEDDING_MODEL,
+             model_kwargs={'device': 'cpu'},  # Use 'cuda' if you have a GPU
+             encode_kwargs={'normalize_embeddings': True}
+         )
+         logger.info(f"Initialized local embeddings: {settings.EMBEDDING_MODEL}")
+
+     def build_hybrid_retriever(self, docs):
+         """Build a hybrid retriever using BM25 and vector-based retrieval.
+         Reuses the existing ChromaDB if available and only adds new documents.
+         """
+         try:
+             # Check whether a persisted ChromaDB already exists
+             chroma_db_file = os.path.join(settings.CHROMA_DB_PATH, "chroma.sqlite3")
+             chroma_exists = os.path.exists(settings.CHROMA_DB_PATH) and os.path.exists(chroma_db_file)
+
+             if chroma_exists:
+                 logger.info(f"Loading existing ChromaDB from {settings.CHROMA_DB_PATH}")
+                 try:
+                     # Load the existing vector store
+                     vector_store = Chroma(
+                         persist_directory=settings.CHROMA_DB_PATH,
+                         embedding_function=self.embeddings,
+                         collection_name=settings.CHROMA_COLLECTION_NAME
+                     )
+
+                     # Get existing document IDs to check for new documents
+                     try:
+                         existing_data = vector_store.get()
+                         existing_ids = set(existing_data.get('ids', [])) if existing_data else set()
+                         logger.info(f"Found {len(existing_ids)} existing documents in ChromaDB")
+                     except Exception as e:
+                         logger.warning(f"Could not retrieve existing IDs from ChromaDB: {e}. Treating as empty.")
+                         existing_ids = set()
+
+                     # Filter out documents that already exist (based on content hash)
+                     new_docs = []
+                     doc_ids = []
+                     for doc in docs:
+                         # Generate a deterministic ID from a hash of the chunk content
+                         doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
+                         if doc_id not in existing_ids:
+                             new_docs.append(doc)
+                             doc_ids.append(doc_id)
+
+                     if new_docs:
+                         logger.info(f"Adding {len(new_docs)} new documents to ChromaDB")
+                         # Chroma persists automatically when persist_directory is set,
+                         # so no explicit persist() call is needed after adding.
+                         vector_store.add_documents(new_docs, ids=doc_ids)
+                     else:
+                         logger.info("No new documents to add. Using existing ChromaDB.")
+                 except Exception as e:
+                     logger.warning(f"Failed to load existing ChromaDB: {e}. Creating a new one.")
+                     # Fall back to creating a new DB
+                     vector_store = Chroma.from_documents(
+                         documents=docs,
+                         embedding=self.embeddings,
+                         persist_directory=settings.CHROMA_DB_PATH,
+                         collection_name=settings.CHROMA_COLLECTION_NAME
+                     )
+             else:
+                 logger.info(f"Creating new ChromaDB at {settings.CHROMA_DB_PATH}")
+                 # Create a new Chroma vector store
+                 vector_store = Chroma.from_documents(
+                     documents=docs,
+                     embedding=self.embeddings,
+                     persist_directory=settings.CHROMA_DB_PATH,
+                     collection_name=settings.CHROMA_COLLECTION_NAME
+                 )
+                 logger.info("Vector store created successfully.")
+
+             # Create the keyword (BM25) retriever over the raw documents
+             bm25 = BM25Retriever.from_documents(docs)
+             logger.info("BM25 retriever created successfully.")
+
+             # Create the vector-based retriever
+             vector_retriever = vector_store.as_retriever(search_kwargs={"k": settings.VECTOR_SEARCH_K})
+             logger.info("Vector retriever created successfully.")
+
+             # Combine both retrievers into a weighted hybrid retriever
+             hybrid_retriever = EnsembleRetriever(
+                 retrievers=[bm25, vector_retriever],
+                 weights=settings.HYBRID_RETRIEVER_WEIGHTS
+             )
+             logger.info("Hybrid retriever created successfully.")
+             return hybrid_retriever
+         except Exception as e:
+             logger.error(f"Failed to build hybrid retriever: {e}")
+             raise
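
The builder above lazily reuses a persisted ChromaDB collection, hashes chunk content to avoid re-embedding duplicates, and fuses BM25 keyword search with dense vector search through an `EnsembleRetriever`. A minimal usage sketch (not part of this commit; the sample texts are hypothetical stand-ins for chunks produced by the document processor):

```python
# Sketch: building and querying the hybrid retriever defined above.
from langchain_core.documents import Document
from retriever.builder import RetrieverBuilder

docs = [
    Document(page_content="Ollama serves local LLMs over an HTTP API."),
    Document(page_content="ChromaDB persists embeddings for vector search."),
]

builder = RetrieverBuilder()
retriever = builder.build_hybrid_retriever(docs)

# BM25 catches exact keyword matches; the vector retriever catches
# paraphrases. EnsembleRetriever merges both result lists using the
# weights configured in settings.HYBRID_RETRIEVER_WEIGHTS.
for doc in retriever.invoke("Where are embeddings stored?"):
    print(doc.page_content)
```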
test/ocr_test.pdf ADDED
Binary file (93.5 kB). View file
 
test/sample.png ADDED
Git LFS Details

  • SHA256: 6adad9c714bfa9c44eee0a47c4b0eeeaf5592ec50f515cdff977530569e39db5
  • Pointer size: 131 Bytes
  • Size of remote file: 308 kB
test/test1.py ADDED
@@ -0,0 +1,89 @@
+ from docling.document_converter import DocumentConverter
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ import os
+
+ ### 🔹 Docling PDF Parsing
+ def parse_with_docling(pdf_path):
+     """
+     Parses a PDF (or image) using Docling, extracts markdown content,
+     and prints the full extracted content.
+     """
+     try:
+         # Ensure the file exists
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError(f"File not found: {pdf_path}")
+
+         # Initialize the Docling converter
+         converter = DocumentConverter()
+         markdown_document = converter.convert(pdf_path).document.export_to_markdown()
+
+         # Define headers to split on (modify as needed)
+         headers_to_split_on = [
+             ("#", "Header 1"),
+             ("##", "Header 2"),
+             ("###", "Header 3"),
+         ]
+
+         # Initialize the Markdown splitter
+         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+         docs_list = markdown_splitter.split_text(markdown_document)
+
+         # Print the full extracted sections
+         print("\n✅ Full Extracted Content (Docling):")
+         for idx, doc in enumerate(docs_list):
+             print(f"\n🔹 Section {idx + 1}:\n{doc}\n" + "-" * 80)
+
+         return docs_list
+
+     except Exception as e:
+         print(f"\n❌ Error during Docling processing: {e}")
+         return []
+
+ ### 🔹 LangChain PDF Parsing
+ def parse_with_langchain(pdf_path):
+     """
+     Parses a PDF using LangChain's PyPDFLoader and prints the full extracted text.
+     """
+     try:
+         # Ensure the file exists
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError(f"File not found: {pdf_path}")
+
+         # Load the PDF using PyPDFLoader
+         loader = PyPDFLoader(pdf_path)
+         pages = loader.load()
+
+         # Extract text from all pages
+         text = "\n\n".join([page.page_content for page in pages])
+
+         # Print the full extracted content
+         print("\n✅ Full Extracted Content (LangChain):\n")
+         print(text)
+         print("\n" + "=" * 100)
+
+         return text
+
+     except Exception as e:
+         print(f"\n❌ Error during LangChain processing: {e}")
+         return ""
+
+ ### 🔹 Main Execution
+ def main():
+     ocr_path = "test/ocr_test.pdf"
+     scanned_image_path = "test/sample.png"
+
+     print("\n🔍 Running Docling Extraction for OCR...")
+     docling_docs = parse_with_docling(ocr_path)
+
+     print("\n🔍 Running LangChain Extraction for OCR...")
+     langchain_text = parse_with_langchain(ocr_path)
+
+     print("\n🔍 Running Docling Extraction for the scanned image...")
+     docling_docs = parse_with_docling(scanned_image_path)
+
+     print("\n🔍 Running LangChain Extraction for the scanned image...")
+     langchain_text = parse_with_langchain(scanned_image_path)
+
+ if __name__ == "__main__":
+     main()
utils/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .logging import logger
+
+ __all__ = ["logger"]
utils/__pycache__/logging.cpython-310.pyc ADDED
Binary file (320 Bytes). View file
 
utils/logging.py ADDED
@@ -0,0 +1,8 @@
+ from loguru import logger
+
+ logger.add(
+     "app.log",
+     rotation="10 MB",
+     retention="30 days",
+     format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
+ )
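
Because `utils/__init.py__` re-exports this configured logger, every module can share the same rotating `app.log` sink. A small usage sketch (not part of this commit; the messages are hypothetical):

```python
# Sketch: using the shared loguru logger configured above.
from utils.logging import logger

logger.info("Document upload started")
try:
    raise ValueError("unreadable chunk")  # hypothetical failure
except ValueError:
    # logger.exception() logs at ERROR level and appends the traceback.
    logger.exception("Failed to process chunk")
```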
uv.lock ADDED
The diff for this file is too large to render. See raw diff