abrar-adnan committed
Commit 6acfeaf · verified · 1 Parent(s): 008a215

Initial commit

.env ADDED
@@ -0,0 +1,12 @@
+ # Ollama Configuration (local)
+ OLLAMA_BASE_URL=http://localhost:11434
+ OLLAMA_MODEL_RESEARCH=llama3.2:3b
+ OLLAMA_MODEL_VERIFICATION=llama3.2:3b
+ OLLAMA_MODEL_RELEVANCE=llama3.2:3b
+
+ # Embedding Model (local)
+ EMBEDDING_MODEL=all-MiniLM-L6-v2
+
+ # Optional settings
+ LOG_LEVEL=INFO
+ CHROMA_DB_PATH=./chroma_db
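
The app reads these values through config/settings.py and talks to a local Ollama server, so the configured model must already be pulled. A minimal preflight sketch (not part of the commit), assuming the `ollama` Python client pinned in requirements-dev.txt:

```python
# Check that the server at OLLAMA_BASE_URL is up and llama3.2:3b is pulled.
import ollama

client = ollama.Client(host="http://localhost:11434")  # OLLAMA_BASE_URL from .env
available = [m.model for m in client.list().models]
if not any(name.startswith("llama3.2:3b") for name in available):
    raise SystemExit("Model missing - run: ollama pull llama3.2:3b")
print("Ollama reachable; models:", available)
```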
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ test/sample.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ # ChromaDB files
+ chroma_db/
+ document_cache/
+ chroma_db/chroma.sqlite3
+ app.log
+ examples/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10.18
LICENSE.md ADDED
@@ -0,0 +1,16 @@
+ ## Non-Commercial License
+
+ Copyright (c) [2025] [Hailey Thao Quach]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, merge, publish, and distribute the Software, subject to the following conditions:
+
+ 1. **Non-Commercial Use Only**
+ This software is licensed for non-commercial purposes only. Commercial use, including but not limited to, selling, licensing, incorporating into for-profit products or services, or otherwise using the software for financial gain, is strictly prohibited without prior written permission from the copyright holder.
+
+ 2. **Attribution**
+ Any use of this software must include proper attribution to the original author(s) by retaining this license text in all copies or substantial portions of the software.
+
+ 3. **No Warranty**
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ For permissions beyond the scope of this license, please contact [hailey@haileyq.com].
agents/__init.py__ ADDED
@@ -0,0 +1,5 @@
+ from .research_agent import ResearchAgent
+ from .verification_agent import VerificationAgent
+ from .workflow import AgentWorkflow
+
+ __all__ = ["ResearchAgent", "VerificationAgent", "AgentWorkflow"]
agents/__pycache__/relevance_checker.cpython-310.pyc ADDED
Binary file (3.41 kB)
agents/__pycache__/research_agent.cpython-310.pyc ADDED
Binary file (3.26 kB)
agents/__pycache__/verification_agent.cpython-310.pyc ADDED
Binary file (9.35 kB)
agents/__pycache__/workflow.cpython-310.pyc ADDED
Binary file (4.55 kB)
agents/relevance_checker.py ADDED
@@ -0,0 +1,88 @@
+ from langchain_ollama import ChatOllama
+ from config.settings import settings
+ import re
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class RelevanceChecker:
+     def __init__(self):
+         # Initialize the local Ollama LLM
+         print("Initializing RelevanceChecker with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_RELEVANCE,
+             temperature=0,
+             num_predict=10,
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def check(self, question: str, retriever, k=3) -> str:
+         """
+         1. Retrieve the top-k document chunks from the global retriever.
+         2. Combine them into a single text string.
+         3. Pass that text + question to the LLM for classification.
+
+         Returns: "CAN_ANSWER", "PARTIAL", or "NO_MATCH".
+         """
+         logger.debug(f"RelevanceChecker.check called with question='{question}' and k={k}")
+
+         # Retrieve doc chunks from the ensemble retriever
+         top_docs = retriever.invoke(question)
+         if not top_docs:
+             logger.debug("No documents returned from retriever.invoke(). Classifying as NO_MATCH.")
+             return "NO_MATCH"
+
+         # Combine the top k chunk texts into one string
+         document_content = "\n\n".join(doc.page_content for doc in top_docs[:k])
+
+         # Create a prompt for the LLM to classify relevance
+         prompt = f"""
+ You are an AI relevance checker between a user's question and provided document content.
+
+ **Instructions:**
+ - Classify how well the document content addresses the user's question.
+ - Respond with only one of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH.
+ - Do not include any additional text or explanation.
+
+ **Labels:**
+ 1) "CAN_ANSWER": The passages contain enough explicit information to fully answer the question.
+ 2) "PARTIAL": The passages mention or discuss the question's topic but do not provide all the details needed for a complete answer.
+ 3) "NO_MATCH": The passages do not discuss or mention the question's topic at all.
+
+ **Important:** If the passages mention or reference the topic or timeframe of the question in any way, even if incomplete, respond with "PARTIAL" instead of "NO_MATCH".
+
+ **Question:** {question}
+ **Passages:** {document_content}
+
+ **Respond ONLY with one of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH**
+ """
+
+         # Call the LLM
+         try:
+             response = self.llm.invoke(prompt)
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content.strip().upper()
+             else:
+                 llm_response = str(response).strip().upper()
+
+         except Exception as e:
+             logger.error(f"Error during model inference: {e}")
+             return "NO_MATCH"
+
+         logger.debug(f"LLM response: {llm_response}")
+
+         # Validate the response
+         valid_labels = {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}
+         if llm_response not in valid_labels:
+             logger.debug("LLM did not respond with a valid label. Forcing 'NO_MATCH'.")
+             classification = "NO_MATCH"
+         else:
+             logger.debug(f"Classification recognized as '{llm_response}'.")
+             classification = llm_response
+
+         print(f"Checker response: {classification}")
+         return classification
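
A minimal usage sketch for RelevanceChecker (not part of the commit). The stub stands in for the hybrid retriever that retriever/builder.py constructs at runtime; anything exposing `.invoke(question) -> list[Document]` works, since `check` only calls that method. A running Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.relevance_checker import RelevanceChecker

class StubRetriever:
    def invoke(self, question: str):
        # Hand-made chunk standing in for real retrieval results.
        return [Document(page_content="Q3 2023 revenue grew 12% year over year.")]

checker = RelevanceChecker()
label = checker.check("How much did revenue grow in Q3 2023?", StubRetriever(), k=3)
assert label in {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}
```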
agents/research_agent.py ADDED
@@ -0,0 +1,85 @@
+ from langchain_ollama import OllamaLLM
+ from langchain_ollama import ChatOllama
+ from typing import Dict, List
+ from langchain_core.documents.base import Document
+ from config.settings import settings
+
+ class ResearchAgent:
+     def __init__(self):
+         """
+         Initialize the research agent with local Ollama LLM.
+         """
+         print("Initializing ResearchAgent with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_RESEARCH,
+             temperature=0.3,
+             num_predict=300,  # max_tokens equivalent
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def sanitize_response(self, response_text: str) -> str:
+         """
+         Sanitize the LLM's response by stripping unnecessary whitespace.
+         """
+         return response_text.strip()
+
+     def generate_prompt(self, question: str, context: str) -> str:
+         """
+         Generate a structured prompt for the LLM to generate a precise and factual answer.
+         """
+         prompt = f"""
+ You are an AI assistant designed to provide precise and factual answers based on the given context.
+
+ **Instructions:**
+ - Answer the following question using only the provided context.
+ - Be clear, concise, and factual.
+ - Return as much information as you can get from the context.
+
+ **Question:** {question}
+ **Context:**
+ {context}
+
+ **Provide your answer below:**
+ """
+         return prompt
+
+     def generate(self, question: str, documents: List[Document]) -> Dict:
+         """
+         Generate an initial answer using the provided documents.
+         """
+         print(f"ResearchAgent.generate called with question='{question}' and {len(documents)} documents.")
+
+         # Combine the top document contents into one string
+         context = "\n\n".join([doc.page_content for doc in documents])
+         print(f"Combined context length: {len(context)} characters.")
+
+         # Create a prompt for the LLM
+         prompt = self.generate_prompt(question, context)
+         print("Prompt created for the LLM.")
+
+         # Call the LLM to generate the answer
+         try:
+             print("Sending prompt to Ollama...")
+             response = self.llm.invoke(prompt)
+             print("LLM response received.")
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content
+             else:
+                 llm_response = str(response)
+
+         except Exception as e:
+             print(f"Error during model inference: {e}")
+             raise RuntimeError("Failed to generate answer due to a model error.") from e
+
+         # Sanitize the response
+         draft_answer = self.sanitize_response(llm_response) if llm_response else "I cannot answer this question based on the provided documents."
+
+         print(f"Generated answer: {draft_answer}")
+
+         return {
+             "draft_answer": draft_answer,
+             "context_used": context
+         }
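
A usage sketch for ResearchAgent.generate (not part of the commit); the `Document` objects are fabricated stand-ins for retrieved chunks, and a local Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.research_agent import ResearchAgent

docs = [
    Document(page_content="DocChat indexes uploaded files into ChromaDB."),
    Document(page_content="Draft answers are verified against the source chunks."),
]
agent = ResearchAgent()
result = agent.generate("What happens to uploaded files?", docs)
print(result["draft_answer"])        # the sanitized draft answer
print(len(result["context_used"]))   # characters of combined context
```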
agents/verification_agent.py ADDED
@@ -0,0 +1,326 @@
+ from langchain_ollama import ChatOllama
+ from typing import Dict, List
+ from langchain_core.documents.base import Document
+ from config.settings import settings
+
+ class VerificationAgent:
+     def __init__(self):
+         """
+         Initialize the verification agent with local Ollama LLM.
+         """
+         print("Initializing VerificationAgent with Ollama (local)...")
+         self.llm = ChatOllama(
+             base_url=settings.OLLAMA_BASE_URL,
+             model=settings.OLLAMA_MODEL_VERIFICATION,
+             temperature=0.0,
+             num_predict=200,
+         )
+         print("Ollama LLM initialized successfully.")
+
+     def sanitize_response(self, response_text: str) -> str:
+         """
+         Sanitize the LLM's response by stripping unnecessary whitespace.
+         """
+         return response_text.strip()
+
+     def generate_prompt(self, answer: str, context: str) -> str:
+         """
+         Generate a structured prompt for the LLM to verify the answer against the context.
+         """
+         prompt = f"""You are a strict verification agent. Your task is to verify if an answer is supported by the provided context.
+
+ CRITICAL RULES:
+ 1. ONLY use information from the context provided below. Do NOT use any external knowledge or assumptions.
+ 2. If a claim in the answer is NOT explicitly or implicitly supported by the context, mark it as unsupported.
+ 3. If the answer contradicts information in the context, mark it as a contradiction.
+ 4. If you cannot verify a claim using ONLY the context, mark it as unsupported.
+ 5. Be strict - do not assume or infer beyond what is clearly stated in the context.
+ 6. Respond EXACTLY in the format specified below - no additional text, explanations, or formatting.
+
+ **VERIFICATION FORMAT (follow exactly):**
+ Supported: YES
+ Unsupported Claims: []
+ Contradictions: []
+ Relevant: YES
+ Additional Details: None
+
+ OR if unsupported/contradictions found:
+ Supported: NO
+ Unsupported Claims: [list each unsupported claim exactly as it appears in the answer]
+ Contradictions: [list each contradiction exactly as it appears]
+ Relevant: YES or NO
+ Additional Details: [brief explanation of why claims are unsupported or contradicted]
+
+ **Answer to verify:**
+ {answer}
+
+ **Context (use ONLY this for verification):**
+ {context}
+
+ **Your verification (respond ONLY with the format above):**
+ """
+         return prompt
+
+     def parse_verification_response(self, response_text: str) -> Dict:
+         """
+         Parse the LLM's verification response into a structured dictionary.
+         """
+         try:
+             # Normalize the response - remove markdown formatting, extra whitespace
+             response_text = response_text.strip()
+             # Remove any markdown code blocks if present
+             if response_text.startswith('```'):
+                 lines = response_text.split('\n')
+                 response_text = '\n'.join(lines[1:-1]) if len(lines) > 2 else response_text
+
+             print(f"[DEBUG] Parsing verification response (first 500 chars): {response_text[:500]}")
+
+             verification = {}
+             lines = response_text.split('\n')
+
+             for line in lines:
+                 line = line.strip()
+                 if not line or ':' not in line:
+                     continue
+
+                 # Split on first colon only
+                 parts = line.split(':', 1)
+                 if len(parts) != 2:
+                     continue
+
+                 key = parts[0].strip()
+                 value = parts[1].strip()
+
+                 # Normalize key names (case-insensitive matching).
+                 # Note: 'unsupported' must be tested before 'supported' -- the
+                 # word "unsupported" contains "supported", so checking the
+                 # broader key first would swallow "Unsupported Claims" lines
+                 # and clobber the "Supported" field.
+                 key_lower = key.lower()
+                 if 'unsupported' in key_lower:
+                     # Handle list parsing
+                     items = []
+                     value = value.strip()
+                     if value.lower() in ['none', 'n/a', '[]', '']:
+                         items = []
+                     elif value.startswith('[') and value.endswith(']'):
+                         # Parse list items
+                         list_content = value[1:-1].strip()
+                         if list_content:
+                             items = [item.strip().strip('"').strip("'").strip()
+                                      for item in list_content.split(',')
+                                      if item.strip()]
+                     else:
+                         # Single item or comma-separated without brackets
+                         items = [item.strip().strip('"').strip("'")
+                                  for item in value.split(',')
+                                  if item.strip() and item.strip().lower() not in ['none', 'n/a']]
+                     verification["Unsupported Claims"] = items
+
+                 elif 'supported' in key_lower:
+                     # Extract YES/NO, handle variations
+                     value_upper = value.upper()
+                     print(f"[DEBUG] Found 'Supported' key with value: '{value}' (upper: '{value_upper}')")
+                     if 'YES' in value_upper or 'TRUE' in value_upper or 'Y' == value_upper.strip():
+                         verification["Supported"] = "YES"
+                         print("[DEBUG] Set Supported to YES")
+                     elif 'NO' in value_upper or 'FALSE' in value_upper or 'N' == value_upper.strip():
+                         verification["Supported"] = "NO"
+                         print("[DEBUG] Set Supported to NO")
+                     else:
+                         # If the value is empty or unclear, decide later based on
+                         # whether unsupported claims/contradictions were found
+                         print(f"[DEBUG] Supported value unclear: '{value}', will decide based on claims/contradictions")
+                         verification["Supported"] = None  # Mark as undecided
+
+                 elif 'contradiction' in key_lower:
+                     # Handle list parsing (same logic as unsupported)
+                     items = []
+                     value = value.strip()
+                     if value.lower() in ['none', 'n/a', '[]', '']:
+                         items = []
+                     elif value.startswith('[') and value.endswith(']'):
+                         list_content = value[1:-1].strip()
+                         if list_content:
+                             items = [item.strip().strip('"').strip("'").strip()
+                                      for item in list_content.split(',')
+                                      if item.strip()]
+                     else:
+                         items = [item.strip().strip('"').strip("'")
+                                  for item in value.split(',')
+                                  if item.strip() and item.strip().lower() not in ['none', 'n/a']]
+                     verification["Contradictions"] = items
+
+                 elif 'relevant' in key_lower:
+                     value_upper = value.upper()
+                     if 'YES' in value_upper or 'TRUE' in value_upper:
+                         verification["Relevant"] = "YES"
+                     elif 'NO' in value_upper or 'FALSE' in value_upper:
+                         verification["Relevant"] = "NO"
+                     else:
+                         verification["Relevant"] = "YES"  # Default to YES if unclear
+
+                 elif 'additional' in key_lower or 'detail' in key_lower:
+                     if value.lower() in ['none', 'n/a', '']:
+                         verification["Additional Details"] = ""
+                     else:
+                         verification["Additional Details"] = value
+
+             # Ensure all required keys are present with defaults
+             if "Supported" not in verification or verification.get("Supported") is None:
+                 # If undecided, check if there are unsupported claims or contradictions
+                 unsupported_claims = verification.get("Unsupported Claims", [])
+                 contradictions = verification.get("Contradictions", [])
+                 if not unsupported_claims and not contradictions:
+                     verification["Supported"] = "YES"  # No issues found, default to YES
+                     print("[DEBUG] Supported was missing/undecided and no claims/contradictions were found; defaulting to YES")
+                 else:
+                     verification["Supported"] = "NO"  # Issues found, default to NO
+                     print(f"[DEBUG] Supported was missing/undecided with {len(unsupported_claims)} unsupported claims and {len(contradictions)} contradictions; defaulting to NO")
+             if "Unsupported Claims" not in verification:
+                 verification["Unsupported Claims"] = []
+             if "Contradictions" not in verification:
+                 verification["Contradictions"] = []
+             if "Relevant" not in verification:
+                 verification["Relevant"] = "YES"
+             if "Additional Details" not in verification:
+                 verification["Additional Details"] = ""
+
+             print(f"[DEBUG] Final parsed verification: Supported={verification.get('Supported')}, Unsupported Claims={len(verification.get('Unsupported Claims', []))}, Contradictions={len(verification.get('Contradictions', []))}")
+             return verification
+         except Exception as e:
+             print(f"Error parsing verification response: {e}")
+             print(f"Response text was: {response_text}")
+             # Return a safe default
+             return {
+                 "Supported": "NO",
+                 "Unsupported Claims": [],
+                 "Contradictions": [],
+                 "Relevant": "NO",
+                 "Additional Details": f"Parsing error: {str(e)}"
+             }
+
+     def format_verification_report(self, verification: Dict) -> str:
+         """
+         Format the verification report dictionary into a readable markdown-formatted report.
+         """
+         supported = verification.get("Supported", "NO")
+         unsupported_claims = verification.get("Unsupported Claims", [])
+         contradictions = verification.get("Contradictions", [])
+         relevant = verification.get("Relevant", "NO")
+         additional_details = verification.get("Additional Details", "")
+
+         # Use markdown formatting for better display
+         report = "### Verification Report\n\n"
+
+         # Add status indicators
+         supported_icon = "✅" if supported == "YES" else "❌"
+         report += f"**Supported:** {supported_icon} {supported}\n\n"
+
+         if unsupported_claims:
+             report += "**⚠️ Unsupported Claims:**\n"
+             for claim in unsupported_claims:
+                 report += f"- {claim}\n"
+             report += "\n"
+         else:
+             report += "**Unsupported Claims:** None\n\n"
+
+         if contradictions:
+             report += "**🔴 Contradictions:**\n"
+             for contradiction in contradictions:
+                 report += f"- {contradiction}\n"
+             report += "\n"
+         else:
+             report += "**Contradictions:** None\n\n"
+
+         relevant_icon = "✅" if relevant == "YES" else "❌"
+         report += f"**Relevant:** {relevant_icon} {relevant}\n\n"
+
+         if additional_details and additional_details.lower() not in ['none', 'n/a', '']:
+             report += f"**Additional Details:**\n{additional_details}\n"
+         else:
+             report += "**Additional Details:** None\n"
+
+         return report
+
+     def generate_out_of_context_report(self) -> str:
+         """
+         Generate a verification report for questions that are out of context.
+         """
+         verification = {
+             "Supported": "NO",
+             "Unsupported Claims": ["The question is not related to the provided documents."],
+             "Contradictions": [],
+             "Relevant": "NO",
+             "Additional Details": "The question cannot be answered using the provided documents as it is out of context."
+         }
+         return self.format_verification_report(verification)
+
+     def check(self, answer: str, documents: List[Document]) -> Dict:
+         """
+         Verify the answer against the provided documents.
+         """
+         print(f"VerificationAgent.check called with answer='{answer}' and {len(documents)} documents.")
+
+         # Combine all document contents into one string
+         context_parts = [doc.page_content for doc in documents]
+         context = "\n\n".join(context_parts)
+
+         # Limit context size to prevent token overflow (keep the last
+         # MAX_CONTEXT_LENGTH characters; recent content is usually more relevant)
+         MAX_CONTEXT_LENGTH = 10000  # Approximate character limit
+         if len(context) > MAX_CONTEXT_LENGTH:
+             print(f"Context too long ({len(context)} chars), truncating to last {MAX_CONTEXT_LENGTH} chars")
+             context = context[-MAX_CONTEXT_LENGTH:]
+
+         print(f"Combined context length: {len(context)} characters.")
+
+         # Create a prompt for the LLM to verify the answer
+         prompt = self.generate_prompt(answer, context)
+         print("Prompt created for the LLM.")
+
+         # Call the LLM to generate the verification report
+         try:
+             print("Sending prompt to Ollama...")
+             response = self.llm.invoke(prompt)
+             print("LLM response received.")
+
+             # Extract content from LangChain message
+             if hasattr(response, 'content'):
+                 llm_response = response.content
+             else:
+                 llm_response = str(response)
+
+         except Exception as e:
+             print(f"Error during model inference: {e}")
+             raise RuntimeError("Failed to verify answer due to a model error.") from e
+
+         # Sanitize the response
+         sanitized_response = self.sanitize_response(llm_response) if llm_response else ""
+         if not sanitized_response:
+             print("LLM returned an empty response.")
+             verification_report = {
+                 "Supported": "NO",
+                 "Unsupported Claims": [],
+                 "Contradictions": [],
+                 "Relevant": "NO",
+                 "Additional Details": "Empty response from the model."
+             }
+         else:
+             # Parse the response into the expected format
+             verification_report = self.parse_verification_response(sanitized_response)
+             if verification_report is None:
+                 print("LLM did not respond with the expected format. Using default verification report.")
+                 verification_report = {
+                     "Supported": "NO",
+                     "Unsupported Claims": [],
+                     "Contradictions": [],
+                     "Relevant": "NO",
+                     "Additional Details": "Failed to parse the model's response."
+                 }
+
+         # Format the verification report into a readable markdown block
+         verification_report_formatted = self.format_verification_report(verification_report)
+         print(f"Verification report:\n{verification_report_formatted}")
+         print(f"Context used: {context}")
+
+         return {
+             "verification_report": verification_report_formatted,
+             "context_used": context
+         }
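
`parse_verification_response` and `format_verification_report` operate on plain strings, so they can be exercised without calling the model. A sketch with a fabricated response (not part of the commit; constructing the agent builds the ChatOllama client but should not contact the server yet):

```python
from agents.verification_agent import VerificationAgent

agent = VerificationAgent()
sample = (
    "Supported: NO\n"
    "Unsupported Claims: [revenue grew 15%]\n"
    "Contradictions: []\n"
    "Relevant: YES\n"
    "Additional Details: The context states 12%, not 15%."
)
report = agent.parse_verification_response(sample)
assert report["Supported"] == "NO"
assert report["Unsupported Claims"] == ["revenue grew 15%"]
print(agent.format_verification_report(report))  # markdown report with ❌ markers
```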
agents/workflow.py ADDED
@@ -0,0 +1,136 @@
+ from langgraph.graph import StateGraph, END
+ from typing import TypedDict, List, Dict
+ from .research_agent import ResearchAgent
+ from .verification_agent import VerificationAgent
+ from .relevance_checker import RelevanceChecker
+ from langchain_core.documents.base import Document
+ # from langchain.retrievers import EnsembleRetriever
+ from langchain_classic.retrievers.ensemble import EnsembleRetriever
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class AgentState(TypedDict):
+     question: str
+     documents: List[Document]
+     draft_answer: str
+     verification_report: str
+     is_relevant: bool
+     retriever: EnsembleRetriever
+
+ class AgentWorkflow:
+     def __init__(self):
+         self.researcher = ResearchAgent()
+         self.verifier = VerificationAgent()
+         self.relevance_checker = RelevanceChecker()
+         self.compiled_workflow = self.build_workflow()  # Compile once during initialization
+
+     def build_workflow(self):
+         """Create and compile the multi-agent workflow."""
+         workflow = StateGraph(AgentState)
+
+         # Add nodes
+         workflow.add_node("check_relevance", self._check_relevance_step)
+         workflow.add_node("research", self._research_step)
+         workflow.add_node("verify", self._verification_step)
+
+         # Define edges
+         workflow.set_entry_point("check_relevance")
+         workflow.add_conditional_edges(
+             "check_relevance",
+             self._decide_after_relevance_check,
+             {
+                 "relevant": "research",
+                 "irrelevant": END
+             }
+         )
+         workflow.add_edge("research", "verify")
+         workflow.add_conditional_edges(
+             "verify",
+             self._decide_next_step,
+             {
+                 "re_research": "research",
+                 "end": END
+             }
+         )
+         return workflow.compile()
+
+     def _check_relevance_step(self, state: AgentState) -> Dict:
+         retriever = state["retriever"]
+         classification = self.relevance_checker.check(
+             question=state["question"],
+             retriever=retriever,
+             k=20
+         )
+
+         if classification == "CAN_ANSWER":
+             # We have enough info to proceed
+             return {"is_relevant": True}
+
+         elif classification == "PARTIAL":
+             # There's partial coverage, but we can still proceed
+             return {
+                 "is_relevant": True
+             }
+
+         else:  # classification == "NO_MATCH"
+             # Generate verification report for out-of-context questions
+             verification_report = self.verifier.generate_out_of_context_report()
+             return {
+                 "is_relevant": False,
+                 "draft_answer": "This question doesn't appear to be related to the uploaded document(s), or there's no matching data. Please ask another question relevant to the uploaded document(s).",
+                 "verification_report": verification_report
+             }
+
+     def _decide_after_relevance_check(self, state: AgentState) -> str:
+         decision = "relevant" if state["is_relevant"] else "irrelevant"
+         print(f"[DEBUG] _decide_after_relevance_check -> {decision}")
+         return decision
+
+     def full_pipeline(self, question: str, retriever: EnsembleRetriever):
+         try:
+             print(f"[DEBUG] Starting full_pipeline with question='{question}'")
+             documents = retriever.invoke(question)
+             logger.info(f"Retrieved {len(documents)} relevant documents (from .invoke)")
+             # print(documents)
+             initial_state = AgentState(
+                 question=question,
+                 documents=documents,
+                 draft_answer="",
+                 verification_report="",
+                 is_relevant=False,
+                 retriever=retriever
+             )
+
+             final_state = self.compiled_workflow.invoke(initial_state)
+
+             return {
+                 "draft_answer": final_state["draft_answer"],
+                 "verification_report": final_state["verification_report"]
+             }
+         except Exception as e:
+             logger.error(f"Workflow execution failed: {e}")
+             raise
+
+     def _research_step(self, state: AgentState) -> Dict:
+         print(f"[DEBUG] Entered _research_step with question='{state['question']}'")
+         result = self.researcher.generate(state["question"], state["documents"])
+         print("[DEBUG] Researcher returned draft answer.")
+         return {"draft_answer": result["draft_answer"]}
+
+     def _verification_step(self, state: AgentState) -> Dict:
+         print("[DEBUG] Entered _verification_step. Verifying the draft answer...")
+         result = self.verifier.check(state["draft_answer"], state["documents"])
+         print("[DEBUG] VerificationAgent returned a verification report.")
+         return {"verification_report": result["verification_report"]}
+
+     def _decide_next_step(self, state: AgentState) -> str:
+         verification_report = state["verification_report"]
+         print(f"[DEBUG] _decide_next_step with verification_report='{verification_report}'")
+         if "Supported: NO" in verification_report or "Relevant: NO" in verification_report:
+             logger.info("[DEBUG] Verification indicates re-research needed.")
+             return "re_research"
+         else:
+             logger.info("[DEBUG] Verification successful, ending workflow.")
+             return "end"
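
An end-to-end sketch for AgentWorkflow.full_pipeline (not part of the commit). The stub stands in for the EnsembleRetriever the app builds from uploads; duck typing suffices because the workflow only calls `retriever.invoke()`. A running Ollama server is assumed:

```python
from langchain_core.documents.base import Document
from agents.workflow import AgentWorkflow

class StubRetriever:
    def invoke(self, question: str):
        # Toy chunk; real runs use chunks from the uploaded documents.
        return [Document(page_content="The DocChat prototype shipped in 2025.")]

flow = AgentWorkflow()
result = flow.full_pipeline("When did the prototype ship?", StubRetriever())
print(result["draft_answer"])
print(result["verification_report"])
```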
app.py ADDED
@@ -0,0 +1,422 @@
+ import gradio as gr
+ import hashlib
+ from typing import List, Dict, Tuple
+ import os
+ import time
+
+ from document_processor.file_handler import DocumentProcessor
+ from retriever.builder import RetrieverBuilder
+ from agents.workflow import AgentWorkflow
+ from config import constants, settings
+ from utils.logging import logger
+
+
+ def main():
+     processor = DocumentProcessor()
+     retriever_builder = RetrieverBuilder()
+     workflow = AgentWorkflow()
+
+     # Define custom CSS for ChatGPT-like styling with dark sidebar
+     css = """
+     .sidebar {
+         background: #202123 !important;
+         border-right: 1px solid #343541 !important;
+         max-height: 90vh !important;
+         height: auto !important;
+         overflow-y: auto !important;
+         padding: 15px !important;
+         color: #ffffff !important;
+     }
+
+     .sidebar * {
+         color: #ffffff !important;
+     }
+
+     .sidebar label {
+         color: #d1d5db !important;
+     }
+
+     .sidebar input, .sidebar select, .sidebar textarea {
+         background: #343541 !important;
+         color: #ffffff !important;
+         border: 1px solid #565869 !important;
+     }
+
+     .main-container {
+         max-height: 90vh !important;
+         height: auto !important;
+         overflow-y: auto !important;
+     }
+
+     .chat-area {
+         height: 100vh !important;
+         display: flex !important;
+         flex-direction: column !important;
+         padding: 10px !important;
+     }
+
+     .chatbot-container {
+         flex: 1 1 auto !important;
+         min-height: 300px !important;
+         max-height: calc(100vh - 280px) !important;
+         overflow: hidden !important;
+     }
+
+     .chatbot-container .gradio-chatbot {
+         height: 100% !important;
+         max-height: calc(100vh - 280px) !important;
+     }
+
+     .input-area {
+         margin-top: 10px !important;
+     }
+
+     .processing-status {
+         padding: 8px !important;
+         border-radius: 6px !important;
+         margin: 8px 0 !important;
+         font-size: 0.9em !important;
+     }
+
+     .success {
+         background: #d4edda !important;
+         color: #155724 !important;
+         border: 1px solid #c3e6cb !important;
+     }
+
+     .error {
+         background: #f8d7da !important;
+         color: #721c24 !important;
+         border: 1px solid #f5c6cb !important;
+     }
+
+     .progress-info {
+         font-size: 0.85em !important;
+         color: #666 !important;
+         margin-top: 5px !important;
+     }
+     """
+
+     with gr.Blocks(theme=gr.themes.Soft(), title="DocChat", css=css) as demo:
+         # Session state for document processing
+         session_state = gr.State({
+             "file_hashes": frozenset(),
+             "retriever": None,
+             "processed_files": [],
+             "chat_history": []
+         })
+
+         # Main layout: Sidebar + Chat
+         with gr.Row(equal_height=True, elem_classes="main-container"):
+             # Left Sidebar for file management (narrower)
+             with gr.Column(scale=0.7, min_width=250, elem_classes="sidebar"):
+                 gr.Markdown("## 📁 Upload your documents here", elem_classes="title")
+
+                 # File upload component - files shown here
+                 files = gr.Files(
+                     label="Upload Documents",
+                     file_types=constants.ALLOWED_TYPES,
+                     height=150,
+                     show_label=True
+                 )
+
+                 # Sync button and status
+                 with gr.Row():
+                     sync_btn = gr.Button("🔄 Sync", variant="primary", scale=1, size="sm", elem_classes=["flex-item"])
+                 with gr.Row():
+                     sync_status_indicator = gr.HTML(
+                         '<div style="padding: 6px; text-align: center; border-radius: 4px; background: #343541; color: #9ca3af; font-size: 0.85em; width: 100%;">Not synced</div>',
+                         visible=True,
+                         elem_classes=["flex-item"]
+                     )
+                 # Provide equal layout using some CSS tweaks
+                 gr.HTML("""
+                 <style>
+                 .svelte-1ipelgc.flex-item { flex: 1 1 0 !important; min-width: 0 !important; }
+                 </style>
+                 """)
+
+                 # Processing status (only show when processing/processed)
+                 processing_status = gr.Markdown("", elem_classes="processing-status", visible=False)
+
+                 # Verification Report Section
+                 gr.HTML("""
+                 <style>
+                 .compact-markdown p,
+                 .compact-markdown h4,
+                 .compact-markdown h5,
+                 .compact-markdown h6 {
+                     margin-top: 0.25rem !important;
+                     margin-bottom: 0.25rem !important;
+                 }
+                 </style>
+                 """)
+
+                 # gr.Markdown("---")
+                 gr.Markdown('<span style="font-size: 1em;">📊 Verification Report</span>', elem_classes="compact-markdown")
+                 # gr.Markdown('<span style="font-size: 0.85em; color: #8e9297;"><em>Of the last message</em></span>', elem_classes="compact-markdown")
+                 verification_output = gr.Textbox(
+                     label="",
+                     interactive=False,
+                     lines=12,
+                     max_lines=12,
+                     value="",
+                     placeholder="""### Verification Report""",
+                     show_label=False
+                 )
+
+             # Right side: Chat interface
+             with gr.Column(scale=4, elem_classes="chat-area"):
+                 # Header section
+                 gr.Markdown("# 🤖 GeekBot")
+                 gr.Markdown("*Your personal AI*")
+                 gr.Markdown("*Upload your documents and start chatting about them. Supports pdf, docx, txt, and md files.*")
+
+                 # Chat interface - flex to fill available space
+                 with gr.Column(elem_classes="chatbot-container"):
+                     chatbot = gr.Chatbot(
+                         label="",
+                         show_label=False,
+                         show_copy_button=True,
+                         avatar_images=(None, "🤖"),
+                         container=True,
+                         height=550
+                     )
+
+                 # Input area
+                 with gr.Row(elem_classes="input-area"):
+                     msg = gr.Textbox(
+                         label="",
+                         placeholder="Type your question here...",
+                         show_label=False,
+                         scale=9,
+                         container=False
+                     )
+                     submit_btn = gr.Button("Send", scale=1, variant="primary")
+
+         # Function to remove files from ChromaDB when they're removed from UI
+         def handle_file_removal(current_files: List, state: Dict):
+             """Handle file removal - clean up ChromaDB and retriever if files are removed."""
+             if not current_files:
+                 # All files removed - reset retriever
+                 if state.get("retriever"):
+                     logger.info("All files removed. Resetting retriever.")
+                     state.update({
+                         "retriever": None,
+                         "file_hashes": frozenset(),
+                         "processed_files": []
+                     })
+                     return (
+                         get_sync_status_html("ready"),
+                         "",  # processing_status
+                         gr.update(visible=False),  # processing_status visibility
+                         state
+                     )
+                 return (
+                     get_sync_status_html("ready"),
+                     "",
+                     gr.update(visible=False),
+                     state
+                 )
+
+             # Check if any files were removed
+             current_hashes = _get_file_hashes(current_files)
+             if state.get("file_hashes") and current_hashes != state["file_hashes"]:
+                 # Files were removed - need to rebuild retriever with remaining files
+                 logger.info("Files were removed. Rebuilding retriever with remaining files...")
+                 try:
+                     chunks = processor.process(current_files)
+                     retriever = retriever_builder.build_hybrid_retriever(chunks)
+
+                     state.update({
+                         "file_hashes": current_hashes,
+                         "retriever": retriever,
+                         "processed_files": current_files
+                     })
+
+                     status_html = "✅ **Documents resynced**<br>"
+                     status_html += f"<div class='progress-info'>{len(chunks)} chunks indexed.</div>"
+
+                     return (
+                         get_sync_status_html("synced", len(chunks)),
+                         status_html,
+                         gr.update(visible=True),
+                         state
+                     )
+                 except Exception as e:
+                     logger.error(f"Error resyncing after file removal: {e}")
+                     return (
+                         get_sync_status_html("error"),
+                         f"❌ Error: {str(e)}",
+                         gr.update(visible=True),
+                         state
+                     )
+
+             return (
+                 get_sync_status_html("synced", len(state.get("processed_files", []))),
+                 "",
+                 gr.update(visible=False),
+                 state
+             )
+
+         # Function to update sync status indicator
+         def get_sync_status_html(status: str, count: int = 0) -> str:
+             """Generate HTML for the sync status indicator."""
+             if status == "synced":
+                 return f'<div style="padding: 8px; text-align: center; border-radius: 4px; background: #16a34a; color: #ffffff; font-weight: bold;">✅ Synced ({count} chunks)</div>'
+             elif status == "syncing":
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #f59e0b; color: #ffffff; font-weight: bold;">🔄 Syncing...</div>'
+             elif status == "error":
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #dc2626; color: #ffffff; font-weight: bold;">❌ Error</div>'
+             else:
+                 return '<div style="padding: 8px; text-align: center; border-radius: 4px; background: #343541; color: #9ca3af;">Not synced</div>'
+
+         # Function to process files (called by sync button)
+         def process_files(uploaded_files: List, state: Dict):
+             """Process files and build the retriever."""
+             if not uploaded_files:
+                 return (
+                     get_sync_status_html("ready"),  # sync_status
+                     "",  # processing_status
+                     gr.update(visible=False),  # processing_status visibility
+                     state
+                 )
+
+             try:
+                 current_hashes = _get_file_hashes(uploaded_files)
+
+                 # Check if files are new or changed
+                 if state["retriever"] is None or current_hashes != state["file_hashes"]:
+                     # Process documents
+                     logger.info("Processing new/changed documents...")
+                     chunks = processor.process(uploaded_files)
+                     logger.info("Building retriever...")
+                     retriever = retriever_builder.build_hybrid_retriever(chunks)
+                     logger.info("Retriever built successfully")
+
+                     state.update({
+                         "file_hashes": current_hashes,
+                         "retriever": retriever,
+                         "processed_files": uploaded_files
+                     })
+
+                     status_html = "✅ **Documents synced successfully!**<br>"
+                     status_html += f"<div class='progress-info'>{len(chunks)} chunks indexed. Ready for questions!</div>"
+
+                     return (
+                         get_sync_status_html("synced", len(chunks)),  # sync_status
+                         status_html,  # processing_status
+                         gr.update(visible=True),  # processing_status visibility
+                         state
+                     )
+                 else:
+                     # Files unchanged, already synced
+                     status_html = "✅ **Documents already synced**<br>"
+                     status_html += "<div class='progress-info'>Files are up to date. Ready for questions!</div>"
+
+                     # Get chunk count from state if available
+                     chunk_count = len(state.get("processed_files", []))
+
+                     return (
+                         get_sync_status_html("synced", chunk_count),  # sync_status
+                         status_html,  # processing_status
+                         gr.update(visible=True),  # processing_status visibility
+                         state
+                     )
+             except Exception as e:
+                 error_html = "❌ **Error syncing documents**<br>"
+                 error_html += f"<div class='progress-info'>{str(e)}</div>"
+                 logger.error(f"File processing error: {str(e)}")
+
+                 return (
+                     get_sync_status_html("error"),  # sync_status
+                     error_html,  # processing_status
+                     gr.update(visible=True),  # processing_status visibility
+                     state
+                 )
+
+         # Chat function for handling questions
+         def chat_function(message: str, history: List, state: Dict, verification_state: str):
+             """Handle chat messages and generate responses."""
+             try:
+                 if not message.strip():
+                     history.append((message, "Please enter a question."))
+                     return history, "", state, verification_state
+
+                 if state["retriever"] is None:
+                     history.append((message, "❌ No documents uploaded. Please upload documents first."))
+                     return history, "", state, verification_state
+
+                 # Get answer from workflow
+                 result = workflow.full_pipeline(
+                     question=message,
+                     retriever=state["retriever"]
+                 )
+
+                 answer = result["draft_answer"]
+                 verification_report = result["verification_report"]
+
+                 # Add to chat history
+                 history.append((message, answer))
+
+                 # Update state
+                 if "chat_history" not in state:
+                     state["chat_history"] = []
+                 state["chat_history"].append({"question": message, "answer": answer})
+
+                 return history, "", state, verification_report
+
+             except Exception as e:
+                 logger.error(f"Chat error: {str(e)}")
+                 error_msg = f"❌ Error: {str(e)}"
+                 history.append((message, error_msg))
+                 return history, "", state, ""
+
+         # Event handlers
+         # Handle file removal - check when files change
+         files.change(
+             fn=handle_file_removal,
+             inputs=[files, session_state],
+             outputs=[sync_status_indicator, processing_status, processing_status, session_state]
+         )
+
+         # Sync button to process files
+         sync_btn.click(
+             fn=process_files,
+             inputs=[files, session_state],
+             outputs=[sync_status_indicator, processing_status, processing_status, session_state],
+             show_progress=True
+         )
+
+         # Chat submission
+         msg.submit(
+             fn=chat_function,
+             inputs=[msg, chatbot, session_state, verification_output],
+             outputs=[chatbot, msg, session_state, verification_output]
+         )
+
+         submit_btn.click(
+             fn=chat_function,
+             inputs=[msg, chatbot, session_state, verification_output],
+             outputs=[chatbot, msg, session_state, verification_output]
+         )
+
+     demo.launch(server_name="127.0.0.1", server_port=5000, share=True)
+
+ def _get_file_hashes(uploaded_files: List) -> frozenset:
+     """Generate SHA-256 hashes for uploaded files."""
+     hashes = set()
+     for file in uploaded_files:
+         # Handle both Gradio file objects and string paths
+         file_path = file.name if hasattr(file, 'name') else file
+         try:
+             with open(file_path, "rb") as f:
+                 hashes.add(hashlib.sha256(f.read()).hexdigest())
+         except Exception as e:
+             logger.error(f"Error hashing file {file_path}: {e}")
+             continue
+     return frozenset(hashes)
+
+ if __name__ == "__main__":
+     main()
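
A sketch of the content-hash dedup behind syncing (not part of the commit): _get_file_hashes hashes file contents into a frozenset, so duplicate uploads collapse to one digest and "nothing changed" is a simple set-equality test. Note that importing `app` pulls in the full dependency stack:

```python
import tempfile
from app import _get_file_hashes

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("hello docchat")
    path = f.name

assert _get_file_hashes([path]) == _get_file_hashes([path, path])
assert len(_get_file_hashes([path])) == 1  # duplicates collapse to one hash
```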
config/__init.py__ ADDED
@@ -0,0 +1,4 @@
+ from .settings import settings
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+
+ __all__ = ["settings", "MAX_FILE_SIZE", "MAX_TOTAL_SIZE", "ALLOWED_TYPES"]
config/__pycache__/constants.cpython-310.pyc ADDED
Binary file (313 Bytes)
config/__pycache__/settings.cpython-310.pyc ADDED
Binary file (1.39 kB)
config/constants.py ADDED
@@ -0,0 +1,8 @@
+ # Maximum allowed size for a single file (50 MB)
+ MAX_FILE_SIZE: int = 50 * 1024 * 1024
+
+ # Maximum allowed total size for all uploaded files (200 MB)
+ MAX_TOTAL_SIZE: int = 200 * 1024 * 1024
+
+ # Allowed file types for upload
+ ALLOWED_TYPES: list = [".txt", ".pdf", ".docx", ".md"]
config/settings.py ADDED
@@ -0,0 +1,39 @@
+ from pydantic_settings import BaseSettings
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+ import os
+
+ class Settings(BaseSettings):
+     # Ollama settings (local)
+     OLLAMA_BASE_URL: str = "http://localhost:11434"
+     OLLAMA_MODEL_RESEARCH: str = "llama3.2:3b"
+     OLLAMA_MODEL_VERIFICATION: str = "llama3.2:3b"
+     OLLAMA_MODEL_RELEVANCE: str = "llama3.2:3b"
+
+     # Embedding model (local)
+     EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"  # sentence-transformers model
+
+     # Optional settings with defaults
+     MAX_FILE_SIZE: int = MAX_FILE_SIZE
+     MAX_TOTAL_SIZE: int = MAX_TOTAL_SIZE
+     ALLOWED_TYPES: list = ALLOWED_TYPES
+
+     # Database settings
+     CHROMA_DB_PATH: str = "./chroma_db"
+     CHROMA_COLLECTION_NAME: str = "documents"
+
+     # Retrieval settings
+     VECTOR_SEARCH_K: int = 10
+     HYBRID_RETRIEVER_WEIGHTS: list = [0.4, 0.6]
+
+     # Logging settings
+     LOG_LEVEL: str = "INFO"
+
+     # Cache settings
+     CACHE_DIR: str = "document_cache"
+     CACHE_EXPIRE_DAYS: int = 7
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+
+ settings = Settings()
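
With pydantic-settings, process environment variables take precedence over the `.env` file, which in turn overrides the class defaults above. A quick override sketch (not part of the commit; the model name is hypothetical):

```python
import os
os.environ["OLLAMA_MODEL_RESEARCH"] = "llama3.1:8b"  # hypothetical override

from config.settings import Settings
assert Settings().OLLAMA_MODEL_RESEARCH == "llama3.1:8b"
```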
document_processor/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .file_handler import DocumentProcessor
+
+ __all__ = ["DocumentProcessor"]
document_processor/__pycache__/file_handler.cpython-310.pyc ADDED
Binary file (4.17 kB)
document_processor/file_handler.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ import hashlib
+ import pickle
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import List
+ from docling.document_converter import DocumentConverter
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
+ from config import constants
+ from config.settings import settings
+ from utils.logging import logger
+
+ class DocumentProcessor:
+     def __init__(self):
+         self.headers = [("#", "Header 1"), ("##", "Header 2")]
+         self.cache_dir = Path(settings.CACHE_DIR)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+     def validate_files(self, files: List) -> None:
+         """Validate the total size of the uploaded files."""
+         total_size = 0
+         for f in files:
+             # Handle both Gradio file objects and string paths
+             file_path = f.name if hasattr(f, 'name') else f
+             try:
+                 total_size += os.path.getsize(file_path)
+             except Exception as e:
+                 logger.warning(f"Could not get size for {file_path}: {e}")
+                 continue
+         if total_size > constants.MAX_TOTAL_SIZE:
+             raise ValueError(f"Total size exceeds {constants.MAX_TOTAL_SIZE//1024//1024}MB limit")
+
+     def process(self, files: List) -> List:
+         """Process files with caching for subsequent queries."""
+         self.validate_files(files)
+         all_chunks = []
+         seen_hashes = set()
+
+         for file in files:
+             try:
+                 # Handle both Gradio file objects and string paths
+                 file_path = file.name if hasattr(file, 'name') else file
+
+                 # Generate content-based hash for caching
+                 with open(file_path, "rb") as f:
+                     file_hash = self._generate_hash(f.read())
+
+                 cache_path = self.cache_dir / f"{file_hash}.pkl"
+
+                 if self._is_cache_valid(cache_path):
+                     logger.info(f"Loading from cache: {file_path}")
+                     chunks = self._load_from_cache(cache_path)
+                 else:
+                     logger.info(f"Processing and caching: {file_path}")
+                     chunks = self._process_file(file_path)
+                     self._save_to_cache(chunks, cache_path)
+
+                 # Deduplicate chunks across files
+                 for chunk in chunks:
+                     chunk_hash = self._generate_hash(chunk.page_content.encode())
+                     if chunk_hash not in seen_hashes:
+                         all_chunks.append(chunk)
+                         seen_hashes.add(chunk_hash)
+
+             except Exception as e:
+                 file_path_display = file.name if hasattr(file, 'name') else file
+                 logger.error(f"Failed to process {file_path_display}: {str(e)}")
+                 continue
+
+         logger.info(f"Total unique chunks: {len(all_chunks)}")
+         return all_chunks
+
+     def _process_file(self, file) -> List:
+         """Original processing logic with Docling."""
+         # Handle both Gradio file objects and string paths
+         file_path = file.name if hasattr(file, 'name') else file
+
+         if not file_path.endswith(('.pdf', '.docx', '.txt', '.md')):
+             logger.warning(f"Skipping unsupported file type: {file_path}")
+             return []
+
+         converter = DocumentConverter()
+         markdown = converter.convert(file_path).document.export_to_markdown()
+         splitter = MarkdownHeaderTextSplitter(self.headers)
+         return splitter.split_text(markdown)
+
+     def _generate_hash(self, content: bytes) -> str:
+         return hashlib.sha256(content).hexdigest()
+
+     def _save_to_cache(self, chunks: List, cache_path: Path):
+         with open(cache_path, "wb") as f:
+             pickle.dump({
+                 "timestamp": datetime.now().timestamp(),
+                 "chunks": chunks
+             }, f)
+
+     def _load_from_cache(self, cache_path: Path) -> List:
+         with open(cache_path, "rb") as f:
+             data = pickle.load(f)
+         return data["chunks"]
+
+     def _is_cache_valid(self, cache_path: Path) -> bool:
+         if not cache_path.exists():
+             return False
+
+         cache_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
+         return cache_age < timedelta(days=settings.CACHE_EXPIRE_DAYS)
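
A caching sketch for DocumentProcessor (not part of the commit; the input path is hypothetical, and any supported .pdf/.docx/.txt/.md works). The second process() call on an unchanged file should log a cache hit and load pickled chunks from document_cache/ instead of re-running the Docling conversion:

```python
from document_processor.file_handler import DocumentProcessor

processor = DocumentProcessor()
first = processor.process(["./examples/notes.md"])   # parsed via Docling, then cached
second = processor.process(["./examples/notes.md"])  # "Loading from cache" path
assert len(first) == len(second)
```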
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [project]
+ name = "docchat-adnan"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "chromadb>=1.3.0",
+     "docling>=2.59.0",
+     "gradio>=5.49.1",
+     "langchain>=1.0.3",
+     "langchain-community>=0.4.1",
+     "langchain-ollama>=1.0.0",
+     "langchain-text-splitters>=1.0.0",
+     "langgraph>=1.0.2",
+     "loguru>=0.7.3",
+     "rank-bm25>=0.2.2",
+     "sentence-transformers>=5.1.2",
+ ]
requirements-dev.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.11.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.2
5
+ aiosignal==1.4.0
6
+ annotated-doc==0.0.3
7
+ annotated-types==0.7.0
8
+ antlr4-python3-runtime==4.9.3 ; python_full_version < '3.14'
9
+ anyio==4.11.0
10
+ async-timeout==4.0.3 ; python_full_version < '3.11'
11
+ attrs==25.4.0
12
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
13
+ backoff==2.2.1
14
+ bcrypt==5.0.0
15
+ beautifulsoup4==4.14.2
16
+ brotli==1.1.0
17
+ build==1.3.0
18
+ cachetools==6.2.1
19
+ certifi==2025.10.5
20
+ charset-normalizer==3.4.4
21
+ chromadb==1.3.0
22
+ click==8.3.0
23
+ colorama==0.4.6 ; (os_name != 'nt' and sys_platform == 'win32') or (os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux')
24
+ coloredlogs==15.0.1
25
+ colorlog==6.10.1 ; python_full_version < '3.14'
26
+ dataclasses-json==0.6.7
27
+ dill==0.4.0
28
+ distro==1.9.0
29
+ docling==2.59.0
30
+ docling-core==2.50.0
31
+ docling-ibm-models==3.10.2
32
+ docling-parse==4.7.0
33
+ durationpy==0.10
34
+ et-xmlfile==2.0.0
35
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
36
+ faker==37.12.0
37
+ fastapi==0.120.4
38
+ ffmpy==0.6.4
39
+ filelock==3.20.0
40
+ filetype==1.2.0
41
+ flatbuffers==25.9.23
42
+ frozenlist==1.8.0
43
+ fsspec==2025.10.0
44
+ google-auth==2.42.1
45
+ googleapis-common-protos==1.71.0
46
+ gradio==5.49.1
47
+ gradio-client==1.13.3
48
+ greenlet==3.2.4 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
+ groovy==0.1.2
+ grpcio==1.76.0
+ h11==0.16.0
+ hf-xet==1.2.0 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+ httpcore==1.0.9
+ httptools==0.7.1
+ httpx==0.28.1
+ httpx-sse==0.4.3
+ huggingface-hub==0.36.0
+ humanfriendly==10.0
+ idna==3.11
+ importlib-metadata==8.7.0
+ importlib-resources==6.5.2
+ jinja2==3.1.6
+ joblib==1.5.2
+ jsonlines==4.0.0
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonref==1.1.0
+ jsonschema==4.25.1
+ jsonschema-specifications==2025.9.1
+ kubernetes==34.1.0
+ langchain==1.0.3
+ langchain-classic==1.0.0
+ langchain-community==0.4.1
+ langchain-core==1.0.2
+ langchain-ollama==1.0.0
+ langchain-text-splitters==1.0.0
+ langgraph==1.0.2
+ langgraph-checkpoint==3.0.0
+ langgraph-prebuilt==1.0.2
+ langgraph-sdk==0.2.9
+ langsmith==0.4.38
+ latex2mathml==3.78.1
+ loguru==0.7.3
+ lxml==6.0.2
+ markdown-it-py==4.0.0
+ marko==2.2.1
+ markupsafe==3.0.3
+ marshmallow==3.26.1
+ mdurl==0.1.2
+ mmh3==5.2.0
+ mpire==2.10.2
+ mpmath==1.3.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ mypy-extensions==1.1.0
+ networkx==3.4.2 ; python_full_version < '3.11'
+ networkx==3.5 ; python_full_version >= '3.11'
+ numpy==2.2.6 ; python_full_version < '3.11'
+ numpy==2.3.4 ; python_full_version >= '3.11'
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ oauthlib==3.3.1
+ ocrmac==1.0.0 ; sys_platform == 'darwin'
+ ollama==0.6.0
+ omegaconf==2.3.0 ; python_full_version < '3.14'
+ onnxruntime==1.23.2
+ opencv-python==4.11.0.86 ; python_full_version < '3.14'
+ openpyxl==3.1.5
+ opentelemetry-api==1.38.0
+ opentelemetry-exporter-otlp-proto-common==1.38.0
+ opentelemetry-exporter-otlp-proto-grpc==1.38.0
+ opentelemetry-proto==1.38.0
+ opentelemetry-sdk==1.38.0
+ opentelemetry-semantic-conventions==0.59b0
+ orjson==3.11.4
+ ormsgpack==1.11.0
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.3.3
+ pillow==11.3.0
+ pluggy==1.6.0
+ polyfactory==2.22.3
+ posthog==5.4.0
+ propcache==0.4.1
+ protobuf==6.33.0
+ psutil==7.1.2
+ pyasn1==0.6.1
+ pyasn1-modules==0.4.2
+ pybase64==1.4.2
+ pyclipper==1.3.0.post6 ; python_full_version < '3.14'
+ pydantic==2.11.10
+ pydantic-core==2.33.2
+ pydantic-settings==2.11.0
+ pydub==0.25.1
+ pygments==2.19.2
+ pylatexenc==2.10
+ pyobjc-core==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-cocoa==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-coreml==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-quartz==12.0 ; sys_platform == 'darwin'
+ pyobjc-framework-vision==12.0 ; sys_platform == 'darwin'
+ pypdfium2==4.30.0
+ pypika==0.48.9
+ pyproject-hooks==1.2.0
+ pyreadline3==3.5.4 ; sys_platform == 'win32'
+ python-dateutil==2.9.0.post0
+ python-docx==1.2.0
+ python-dotenv==1.2.1
+ python-multipart==0.0.20
+ python-pptx==1.0.2
+ pytz==2025.2
+ pywin32==311 ; sys_platform == 'win32'
+ pyyaml==6.0.3
+ rank-bm25==0.2.2
+ rapidocr==3.4.2 ; python_full_version < '3.14'
+ referencing==0.37.0
+ regex==2025.10.23
+ requests==2.32.5
+ requests-oauthlib==2.0.0
+ requests-toolbelt==1.0.0
+ rich==14.2.0
+ rpds-py==0.28.0
+ rsa==4.9.1
+ rtree==1.4.1
+ ruff==0.14.3
+ safehttpx==0.1.7
+ safetensors==0.6.2
+ scikit-learn==1.7.2
+ scipy==1.15.3 ; python_full_version < '3.11'
+ scipy==1.16.3 ; python_full_version >= '3.11'
+ semantic-version==2.10.0
+ semchunk==2.2.2
+ sentence-transformers==5.1.2
+ setuptools==80.9.0 ; python_full_version >= '3.12'
+ shapely==2.1.2 ; python_full_version < '3.14'
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ soupsieve==2.8
+ sqlalchemy==2.0.44
+ starlette==0.49.2
+ sympy==1.14.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tokenizers==0.22.1
+ tomli==2.3.0 ; python_full_version < '3.11'
+ tomlkit==0.13.3
+ torch==2.9.0
+ torchvision==0.24.0
+ tqdm==4.67.1
+ transformers==4.57.1
+ triton==3.5.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ typer==0.19.2
+ typing-extensions==4.15.0
+ typing-inspect==0.9.0
+ typing-inspection==0.4.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ uvicorn==0.38.0
+ uvloop==0.22.1 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'
+ watchfiles==1.1.1
+ websocket-client==1.9.0
+ websockets==15.0.1
+ win32-setctime==1.2.0 ; sys_platform == 'win32'
+ xlsxwriter==3.2.9
+ xxhash==3.6.0
+ yarl==1.22.0
+ zipp==3.23.0
+ zstandard==0.25.0
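
Many of the pins above carry PEP 508 environment markers (everything after the `;`), so a package such as `pywin32` or the `nvidia-*` CUDA wheels is only installed when the target platform matches. As a minimal illustrative sketch (not part of this commit), the `packaging` library pinned above can evaluate such markers against the current interpreter; the marker strings are copied from the list, everything else here is hypothetical:

```python
# Sketch: evaluating the PEP 508 environment markers used in the pins above.
from packaging.markers import Marker

# Marker strings copied verbatim from the dependency list.
markers = [
    "sys_platform == 'win32'",                                   # pywin32, pyreadline3
    "platform_machine == 'x86_64' and sys_platform == 'linux'",  # nvidia-* CUDA wheels
    "python_full_version < '3.11'",                              # numpy==2.2.6, scipy==1.15.3
]

for m in markers:
    # Marker.evaluate() checks the marker against the *current* environment,
    # which is what pip/uv do when deciding whether to install a pin.
    print(f"{m!r} -> {Marker(m).evaluate()}")
```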
requirements.txt ADDED
@@ -0,0 +1,773 @@
+ # This file was autogenerated by uv via the following command:
+ # uv export --no-hashes --format requirements-txt
+ accelerate==1.11.0
+ # via
+ # docling
+ # docling-ibm-models
+ aiofiles==24.1.0
+ # via gradio
+ aiohappyeyeballs==2.6.1
+ # via aiohttp
+ aiohttp==3.13.2
+ # via langchain-community
+ aiosignal==1.4.0
+ # via aiohttp
+ annotated-doc==0.0.3
+ # via fastapi
+ annotated-types==0.7.0
+ # via pydantic
+ antlr4-python3-runtime==4.9.3 ; python_full_version < '3.14'
+ # via omegaconf
+ anyio==4.11.0
+ # via
+ # gradio
+ # httpx
+ # starlette
+ # watchfiles
+ async-timeout==4.0.3 ; python_full_version < '3.11'
+ # via
+ # aiohttp
+ # langchain-classic
+ attrs==25.4.0
+ # via
+ # aiohttp
+ # jsonlines
+ # jsonschema
+ # referencing
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
+ # via gradio
+ backoff==2.2.1
+ # via posthog
+ bcrypt==5.0.0
+ # via chromadb
+ beautifulsoup4==4.14.2
+ # via docling
+ brotli==1.1.0
+ # via gradio
+ build==1.3.0
+ # via chromadb
+ cachetools==6.2.1
+ # via google-auth
+ certifi==2025.10.5
+ # via
+ # docling
+ # httpcore
+ # httpx
+ # kubernetes
+ # requests
+ charset-normalizer==3.4.4
+ # via requests
+ chromadb==1.3.0
+ # via docchat-adnan
+ click==8.3.0
+ # via
+ # ocrmac
+ # typer
+ # uvicorn
+ colorama==0.4.6 ; (os_name != 'nt' and sys_platform == 'win32') or (os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux')
+ # via
+ # build
+ # click
+ # colorlog
+ # loguru
+ # tqdm
+ # uvicorn
+ coloredlogs==15.0.1
+ # via onnxruntime
+ colorlog==6.10.1 ; python_full_version < '3.14'
+ # via rapidocr
+ dataclasses-json==0.6.7
+ # via langchain-community
+ dill==0.4.0
+ # via multiprocess
+ distro==1.9.0
+ # via posthog
+ docling==2.59.0
+ # via docchat-adnan
+ docling-core==2.50.0
+ # via
+ # docling
+ # docling-ibm-models
+ # docling-parse
+ docling-ibm-models==3.10.2
+ # via docling
+ docling-parse==4.7.0
+ # via docling
+ durationpy==0.10
+ # via kubernetes
+ et-xmlfile==2.0.0
+ # via openpyxl
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
+ # via anyio
+ faker==37.12.0
+ # via polyfactory
+ fastapi==0.120.4
+ # via gradio
+ ffmpy==0.6.4
+ # via gradio
+ filelock==3.20.0
+ # via
+ # huggingface-hub
+ # torch
+ # transformers
+ filetype==1.2.0
+ # via docling
+ flatbuffers==25.9.23
+ # via onnxruntime
+ frozenlist==1.8.0
+ # via
+ # aiohttp
+ # aiosignal
+ fsspec==2025.10.0
+ # via
+ # gradio-client
+ # huggingface-hub
+ # torch
+ google-auth==2.42.1
+ # via kubernetes
+ googleapis-common-protos==1.71.0
+ # via opentelemetry-exporter-otlp-proto-grpc
+ gradio==5.49.1
+ # via docchat-adnan
+ gradio-client==1.13.3
+ # via gradio
+ greenlet==3.2.4 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
+ # via sqlalchemy
+ groovy==0.1.2
+ # via gradio
+ grpcio==1.76.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ h11==0.16.0
+ # via
+ # httpcore
+ # uvicorn
+ hf-xet==1.2.0 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+ # via huggingface-hub
+ httpcore==1.0.9
+ # via httpx
+ httptools==0.7.1
+ # via uvicorn
+ httpx==0.28.1
+ # via
+ # chromadb
+ # gradio
+ # gradio-client
+ # langgraph-sdk
+ # langsmith
+ # ollama
+ # safehttpx
+ httpx-sse==0.4.3
+ # via langchain-community
+ huggingface-hub==0.36.0
+ # via
+ # accelerate
+ # docling
+ # docling-ibm-models
+ # gradio
+ # gradio-client
+ # sentence-transformers
+ # tokenizers
+ # transformers
+ humanfriendly==10.0
+ # via coloredlogs
+ idna==3.11
+ # via
+ # anyio
+ # httpx
+ # requests
+ # yarl
+ importlib-metadata==8.7.0
+ # via
+ # build
+ # opentelemetry-api
+ importlib-resources==6.5.2
+ # via chromadb
+ jinja2==3.1.6
+ # via
+ # gradio
+ # torch
+ joblib==1.5.2
+ # via scikit-learn
+ jsonlines==4.0.0
+ # via docling-ibm-models
+ jsonpatch==1.33
+ # via langchain-core
+ jsonpointer==3.0.0
+ # via jsonpatch
+ jsonref==1.1.0
+ # via docling-core
+ jsonschema==4.25.1
+ # via
+ # chromadb
+ # docling-core
+ jsonschema-specifications==2025.9.1
+ # via jsonschema
+ kubernetes==34.1.0
+ # via chromadb
+ langchain==1.0.3
+ # via docchat-adnan
+ langchain-classic==1.0.0
+ # via langchain-community
+ langchain-community==0.4.1
+ # via docchat-adnan
+ langchain-core==1.0.2
+ # via
+ # langchain
+ # langchain-classic
+ # langchain-community
+ # langchain-ollama
+ # langchain-text-splitters
+ # langgraph
+ # langgraph-checkpoint
+ # langgraph-prebuilt
+ langchain-ollama==1.0.0
+ # via docchat-adnan
+ langchain-text-splitters==1.0.0
+ # via
+ # docchat-adnan
+ # langchain-classic
+ langgraph==1.0.2
+ # via
+ # docchat-adnan
+ # langchain
+ langgraph-checkpoint==3.0.0
+ # via
+ # langgraph
+ # langgraph-prebuilt
+ langgraph-prebuilt==1.0.2
+ # via langgraph
+ langgraph-sdk==0.2.9
+ # via langgraph
+ langsmith==0.4.38
+ # via
+ # langchain-classic
+ # langchain-community
+ # langchain-core
+ latex2mathml==3.78.1
+ # via docling-core
+ loguru==0.7.3
+ # via docchat-adnan
+ lxml==6.0.2
+ # via
+ # docling
+ # python-docx
+ # python-pptx
+ markdown-it-py==4.0.0
+ # via rich
+ marko==2.2.1
+ # via docling
+ markupsafe==3.0.3
+ # via
+ # gradio
+ # jinja2
+ marshmallow==3.26.1
+ # via dataclasses-json
+ mdurl==0.1.2
+ # via markdown-it-py
+ mmh3==5.2.0
+ # via chromadb
+ mpire==2.10.2
+ # via semchunk
+ mpmath==1.3.0
+ # via sympy
+ multidict==6.7.0
+ # via
+ # aiohttp
+ # yarl
+ multiprocess==0.70.18
+ # via mpire
+ mypy-extensions==1.1.0
+ # via typing-inspect
+ networkx==3.4.2 ; python_full_version < '3.11'
+ # via torch
+ networkx==3.5 ; python_full_version >= '3.11'
+ # via torch
+ numpy==2.2.6 ; python_full_version < '3.11'
+ # via
+ # accelerate
+ # chromadb
+ # docling-ibm-models
+ # gradio
+ # langchain-community
+ # onnxruntime
+ # opencv-python
+ # pandas
+ # rank-bm25
+ # rapidocr
+ # safetensors
+ # scikit-learn
+ # scipy
+ # shapely
+ # torchvision
+ # transformers
+ numpy==2.3.4 ; python_full_version >= '3.11'
+ # via
+ # accelerate
+ # chromadb
+ # docling-ibm-models
+ # gradio
+ # langchain-community
+ # onnxruntime
+ # opencv-python
+ # pandas
+ # rank-bm25
+ # rapidocr
+ # safetensors
+ # scikit-learn
+ # scipy
+ # shapely
+ # torchvision
+ # transformers
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cudnn-cu12
+ # nvidia-cusolver-cu12
+ # torch
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cusolver-cu12
+ # torch
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via
+ # nvidia-cufft-cu12
+ # nvidia-cusolver-cu12
+ # nvidia-cusparse-cu12
+ # torch
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ oauthlib==3.3.1
+ # via requests-oauthlib
+ ocrmac==1.0.0 ; sys_platform == 'darwin'
+ # via docling
+ ollama==0.6.0
+ # via langchain-ollama
+ omegaconf==2.3.0 ; python_full_version < '3.14'
+ # via rapidocr
+ onnxruntime==1.23.2
+ # via chromadb
+ opencv-python==4.11.0.86 ; python_full_version < '3.14'
+ # via rapidocr
+ openpyxl==3.1.5
+ # via docling
+ opentelemetry-api==1.38.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ # opentelemetry-sdk
+ # opentelemetry-semantic-conventions
+ opentelemetry-exporter-otlp-proto-common==1.38.0
+ # via opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-exporter-otlp-proto-grpc==1.38.0
+ # via chromadb
+ opentelemetry-proto==1.38.0
+ # via
+ # opentelemetry-exporter-otlp-proto-common
+ # opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-sdk==1.38.0
+ # via
+ # chromadb
+ # opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-semantic-conventions==0.59b0
+ # via opentelemetry-sdk
+ orjson==3.11.4
+ # via
+ # chromadb
+ # gradio
+ # langgraph-sdk
+ # langsmith
+ ormsgpack==1.11.0
+ # via langgraph-checkpoint
+ overrides==7.7.0
+ # via chromadb
+ packaging==25.0
+ # via
+ # accelerate
+ # build
+ # gradio
+ # gradio-client
+ # huggingface-hub
+ # langchain-core
+ # langsmith
+ # marshmallow
+ # onnxruntime
+ # transformers
+ pandas==2.3.3
+ # via
+ # docling
+ # docling-core
+ # gradio
+ pillow==11.3.0
+ # via
+ # docling
+ # docling-core
+ # docling-ibm-models
+ # docling-parse
+ # gradio
+ # ocrmac
+ # python-pptx
+ # rapidocr
+ # sentence-transformers
+ # torchvision
+ pluggy==1.6.0
+ # via docling
+ polyfactory==2.22.3
+ # via docling
+ posthog==5.4.0
+ # via chromadb
+ propcache==0.4.1
+ # via
+ # aiohttp
+ # yarl
+ protobuf==6.33.0
+ # via
+ # googleapis-common-protos
+ # onnxruntime
+ # opentelemetry-proto
+ psutil==7.1.2
+ # via accelerate
+ pyasn1==0.6.1
+ # via
+ # pyasn1-modules
+ # rsa
+ pyasn1-modules==0.4.2
+ # via google-auth
+ pybase64==1.4.2
+ # via chromadb
+ pyclipper==1.3.0.post6 ; python_full_version < '3.14'
+ # via rapidocr
+ pydantic==2.11.10
+ # via
+ # chromadb
+ # docling
+ # docling-core
+ # docling-ibm-models
+ # docling-parse
+ # fastapi
+ # gradio
+ # langchain
+ # langchain-classic
+ # langchain-core
+ # langgraph
+ # langsmith
+ # ollama
+ # pydantic-settings
+ pydantic-core==2.33.2
+ # via pydantic
+ pydantic-settings==2.11.0
+ # via
+ # docling
+ # langchain-community
+ pydub==0.25.1
+ # via gradio
+ pygments==2.19.2
+ # via
+ # mpire
+ # rich
+ pylatexenc==2.10
+ # via docling
+ pyobjc-core==12.0 ; sys_platform == 'darwin'
+ # via
+ # pyobjc-framework-cocoa
+ # pyobjc-framework-coreml
+ # pyobjc-framework-quartz
+ # pyobjc-framework-vision
+ pyobjc-framework-cocoa==12.0 ; sys_platform == 'darwin'
+ # via
+ # pyobjc-framework-coreml
+ # pyobjc-framework-quartz
+ # pyobjc-framework-vision
+ pyobjc-framework-coreml==12.0 ; sys_platform == 'darwin'
+ # via pyobjc-framework-vision
+ pyobjc-framework-quartz==12.0 ; sys_platform == 'darwin'
+ # via pyobjc-framework-vision
+ pyobjc-framework-vision==12.0 ; sys_platform == 'darwin'
+ # via ocrmac
+ pypdfium2==4.30.0
+ # via docling
+ pypika==0.48.9
+ # via chromadb
+ pyproject-hooks==1.2.0
+ # via build
+ pyreadline3==3.5.4 ; sys_platform == 'win32'
+ # via humanfriendly
+ python-dateutil==2.9.0.post0
+ # via
+ # kubernetes
+ # pandas
+ # posthog
+ python-docx==1.2.0
+ # via docling
+ python-dotenv==1.2.1
+ # via
+ # pydantic-settings
+ # uvicorn
+ python-multipart==0.0.20
+ # via gradio
+ python-pptx==1.0.2
+ # via docling
+ pytz==2025.2
+ # via pandas
+ pywin32==311 ; sys_platform == 'win32'
+ # via
+ # docling-parse
+ # mpire
+ pyyaml==6.0.3
+ # via
+ # accelerate
+ # chromadb
+ # docling-core
+ # gradio
+ # huggingface-hub
+ # kubernetes
+ # langchain-classic
+ # langchain-community
+ # langchain-core
+ # omegaconf
+ # rapidocr
+ # transformers
+ # uvicorn
+ rank-bm25==0.2.2
+ # via docchat-adnan
+ rapidocr==3.4.2 ; python_full_version < '3.14'
+ # via docling
+ referencing==0.37.0
+ # via
+ # jsonschema
+ # jsonschema-specifications
+ regex==2025.10.23
+ # via transformers
+ requests==2.32.5
+ # via
+ # docling
+ # huggingface-hub
+ # kubernetes
+ # langchain-classic
+ # langchain-community
+ # langsmith
+ # posthog
+ # rapidocr
+ # requests-oauthlib
+ # requests-toolbelt
+ # transformers
+ requests-oauthlib==2.0.0
+ # via kubernetes
+ requests-toolbelt==1.0.0
+ # via langsmith
+ rich==14.2.0
+ # via
+ # chromadb
+ # typer
+ rpds-py==0.28.0
+ # via
+ # jsonschema
+ # referencing
+ rsa==4.9.1
+ # via google-auth
+ rtree==1.4.1
+ # via
+ # docling
+ # docling-ibm-models
+ ruff==0.14.3
+ # via gradio
+ safehttpx==0.1.7
+ # via gradio
+ safetensors==0.6.2
+ # via
+ # accelerate
+ # docling-ibm-models
+ # transformers
+ scikit-learn==1.7.2
+ # via sentence-transformers
+ scipy==1.15.3 ; python_full_version < '3.11'
+ # via
+ # docling
+ # scikit-learn
+ # sentence-transformers
+ scipy==1.16.3 ; python_full_version >= '3.11'
+ # via
+ # docling
+ # scikit-learn
+ # sentence-transformers
+ semantic-version==2.10.0
+ # via gradio
+ semchunk==2.2.2
+ # via docling-core
+ sentence-transformers==5.1.2
+ # via docchat-adnan
+ setuptools==80.9.0 ; python_full_version >= '3.12'
+ # via torch
+ shapely==2.1.2 ; python_full_version < '3.14'
+ # via rapidocr
+ shellingham==1.5.4
+ # via typer
+ six==1.17.0
+ # via
+ # kubernetes
+ # posthog
+ # python-dateutil
+ # rapidocr
+ sniffio==1.3.1
+ # via anyio
+ soupsieve==2.8
+ # via beautifulsoup4
+ sqlalchemy==2.0.44
+ # via
+ # langchain-classic
+ # langchain-community
+ starlette==0.49.2
+ # via
+ # fastapi
+ # gradio
+ sympy==1.14.0
+ # via
+ # onnxruntime
+ # torch
+ tabulate==0.9.0
+ # via
+ # docling-core
+ # docling-parse
+ tenacity==9.1.2
+ # via
+ # chromadb
+ # langchain-community
+ # langchain-core
+ threadpoolctl==3.6.0
+ # via scikit-learn
+ tokenizers==0.22.1
+ # via
+ # chromadb
+ # transformers
+ tomli==2.3.0 ; python_full_version < '3.11'
+ # via build
+ tomlkit==0.13.3
+ # via gradio
+ torch==2.9.0
+ # via
+ # accelerate
+ # docling-ibm-models
+ # safetensors
+ # sentence-transformers
+ # torchvision
+ torchvision==0.24.0
+ # via docling-ibm-models
+ tqdm==4.67.1
+ # via
+ # chromadb
+ # docling
+ # docling-ibm-models
+ # huggingface-hub
+ # mpire
+ # rapidocr
+ # semchunk
+ # sentence-transformers
+ # transformers
+ transformers==4.57.1
+ # via
+ # docling-core
+ # docling-ibm-models
+ # sentence-transformers
+ triton==3.5.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+ # via torch
+ typer==0.19.2
+ # via
+ # chromadb
+ # docling
+ # docling-core
+ # gradio
+ typing-extensions==4.15.0
+ # via
+ # aiosignal
+ # anyio
+ # beautifulsoup4
+ # chromadb
+ # docling-core
+ # exceptiongroup
+ # fastapi
+ # gradio
+ # gradio-client
+ # grpcio
+ # huggingface-hub
+ # langchain-core
+ # multidict
+ # opentelemetry-api
+ # opentelemetry-exporter-otlp-proto-grpc
+ # opentelemetry-sdk
+ # opentelemetry-semantic-conventions
+ # polyfactory
+ # pydantic
+ # pydantic-core
+ # python-docx
+ # python-pptx
+ # referencing
+ # sentence-transformers
+ # sqlalchemy
+ # starlette
+ # torch
+ # typer
+ # typing-inspect
+ # typing-inspection
+ # uvicorn
+ typing-inspect==0.9.0
+ # via dataclasses-json
+ typing-inspection==0.4.2
+ # via
+ # pydantic
+ # pydantic-settings
+ tzdata==2025.2
+ # via
+ # faker
+ # pandas
+ urllib3==2.3.0
+ # via
+ # kubernetes
+ # requests
+ uvicorn==0.38.0
+ # via
+ # chromadb
+ # gradio
+ uvloop==0.22.1 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'
+ # via uvicorn
+ watchfiles==1.1.1
+ # via uvicorn
+ websocket-client==1.9.0
+ # via kubernetes
+ websockets==15.0.1
+ # via
+ # gradio-client
+ # uvicorn
+ win32-setctime==1.2.0 ; sys_platform == 'win32'
+ # via loguru
+ xlsxwriter==3.2.9
+ # via python-pptx
+ xxhash==3.6.0
+ # via langgraph
+ yarl==1.22.0
+ # via aiohttp
+ zipp==3.23.0
+ # via importlib-metadata
+ zstandard==0.25.0
+ # via langsmith
retriever/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .builder import RetrieverBuilder
+
+ __all__ = ["RetrieverBuilder"]
retriever/__pycache__/builder.cpython-310.pyc ADDED
Binary file (3.26 kB). View file
 
retriever/builder.py ADDED
@@ -0,0 +1,105 @@
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.retrievers import BM25Retriever
+ # from langchain.retrievers import EnsembleRetriever
+ from langchain_classic.retrievers.ensemble import EnsembleRetriever
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from config.settings import settings
+ import logging
+ import os
+ import hashlib
+
+ logger = logging.getLogger(__name__)
+
+ class RetrieverBuilder:
+     def __init__(self):
+         """Initialize the retriever builder with local embeddings."""
+         # Use sentence-transformers for local embeddings
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name=settings.EMBEDDING_MODEL,
+             model_kwargs={'device': 'cpu'},  # Use 'cuda' if you have a GPU
+             encode_kwargs={'normalize_embeddings': True}
+         )
+         logger.info(f"Initialized local embeddings: {settings.EMBEDDING_MODEL}")
+
+     def build_hybrid_retriever(self, docs):
+         """Build a hybrid retriever using BM25 and vector-based retrieval.
+         Reuses the existing ChromaDB if available and only adds new documents.
+         """
+         try:
+             # Check whether a persisted ChromaDB already exists
+             chroma_db_file = os.path.join(settings.CHROMA_DB_PATH, "chroma.sqlite3")
+             chroma_exists = os.path.exists(settings.CHROMA_DB_PATH) and os.path.exists(chroma_db_file)
+
+             if chroma_exists:
+                 logger.info(f"Loading existing ChromaDB from {settings.CHROMA_DB_PATH}")
+                 try:
+                     # Load the existing vector store
+                     vector_store = Chroma(
+                         persist_directory=settings.CHROMA_DB_PATH,
+                         embedding_function=self.embeddings,
+                         collection_name=settings.CHROMA_COLLECTION_NAME
+                     )
+
+                     # Get existing document IDs to check for new documents
+                     try:
+                         existing_data = vector_store.get()
+                         existing_ids = set(existing_data.get('ids', [])) if existing_data else set()
+                         logger.info(f"Found {len(existing_ids)} existing documents in ChromaDB")
+                     except Exception as e:
+                         logger.warning(f"Could not retrieve existing IDs from ChromaDB: {e}. Treating as empty.")
+                         existing_ids = set()
+
+                     # Filter out documents that already exist (based on content hash)
+                     new_docs = []
+                     doc_ids = []
+                     for doc in docs:
+                         # Generate a deterministic ID from a hash of the chunk content
+                         doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
+                         if doc_id not in existing_ids:
+                             new_docs.append(doc)
+                             doc_ids.append(doc_id)
+
+                     if new_docs:
+                         logger.info(f"Adding {len(new_docs)} new documents to ChromaDB")
+                         # Chroma persists automatically when persist_directory is set,
+                         # so no explicit persist() call is needed after adding.
+                         vector_store.add_documents(new_docs, ids=doc_ids)
+                     else:
+                         logger.info("No new documents to add. Using existing ChromaDB.")
+                 except Exception as e:
+                     logger.warning(f"Failed to load existing ChromaDB: {e}. Creating a new one.")
+                     # Fall back to creating a new DB
+                     vector_store = Chroma.from_documents(
+                         documents=docs,
+                         embedding=self.embeddings,
+                         persist_directory=settings.CHROMA_DB_PATH,
+                         collection_name=settings.CHROMA_COLLECTION_NAME
+                     )
+             else:
+                 logger.info(f"Creating new ChromaDB at {settings.CHROMA_DB_PATH}")
+                 # Create a new Chroma vector store
+                 vector_store = Chroma.from_documents(
+                     documents=docs,
+                     embedding=self.embeddings,
+                     persist_directory=settings.CHROMA_DB_PATH,
+                     collection_name=settings.CHROMA_COLLECTION_NAME
+                 )
+                 logger.info("Vector store created successfully.")
+
+             # Create the keyword (BM25) retriever over the raw documents
+             bm25 = BM25Retriever.from_documents(docs)
+             logger.info("BM25 retriever created successfully.")
+
+             # Create the vector-based retriever
+             vector_retriever = vector_store.as_retriever(search_kwargs={"k": settings.VECTOR_SEARCH_K})
+             logger.info("Vector retriever created successfully.")
+
+             # Combine both retrievers into a weighted hybrid retriever
+             hybrid_retriever = EnsembleRetriever(
+                 retrievers=[bm25, vector_retriever],
+                 weights=settings.HYBRID_RETRIEVER_WEIGHTS
+             )
+             logger.info("Hybrid retriever created successfully.")
+             return hybrid_retriever
+         except Exception as e:
+             logger.error(f"Failed to build hybrid retriever: {e}")
+             raise
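
The builder above lazily reuses a persisted ChromaDB collection, hashes chunk content to avoid re-embedding duplicates, and fuses BM25 keyword search with dense vector search through an `EnsembleRetriever`. A minimal usage sketch (not part of this commit; the sample texts are hypothetical stand-ins for chunks produced by the document processor):

```python
# Sketch: building and querying the hybrid retriever defined above.
from langchain_core.documents import Document
from retriever.builder import RetrieverBuilder

docs = [
    Document(page_content="Ollama serves local LLMs over an HTTP API."),
    Document(page_content="ChromaDB persists embeddings for vector search."),
]

builder = RetrieverBuilder()
retriever = builder.build_hybrid_retriever(docs)

# BM25 catches exact keyword matches; the vector retriever catches
# paraphrases. EnsembleRetriever merges both result lists using the
# weights configured in settings.HYBRID_RETRIEVER_WEIGHTS.
for doc in retriever.invoke("Where are embeddings stored?"):
    print(doc.page_content)
```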
test/ocr_test.pdf ADDED
Binary file (93.5 kB). View file
 
test/sample.png ADDED
Git LFS Details

  • SHA256: 6adad9c714bfa9c44eee0a47c4b0eeeaf5592ec50f515cdff977530569e39db5
  • Pointer size: 131 Bytes
  • Size of remote file: 308 kB
test/test1.py ADDED
@@ -0,0 +1,89 @@
+ from docling.document_converter import DocumentConverter
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ import os
+
+ ### 🔹 Docling PDF Parsing
+ def parse_with_docling(pdf_path):
+     """
+     Parses a PDF (or image) using Docling, extracts markdown content,
+     and prints the full extracted content.
+     """
+     try:
+         # Ensure the file exists
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError(f"File not found: {pdf_path}")
+
+         # Initialize the Docling converter
+         converter = DocumentConverter()
+         markdown_document = converter.convert(pdf_path).document.export_to_markdown()
+
+         # Define headers to split on (modify as needed)
+         headers_to_split_on = [
+             ("#", "Header 1"),
+             ("##", "Header 2"),
+             ("###", "Header 3"),
+         ]
+
+         # Initialize the Markdown splitter
+         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+         docs_list = markdown_splitter.split_text(markdown_document)
+
+         # Print the full extracted sections
+         print("\n✅ Full Extracted Content (Docling):")
+         for idx, doc in enumerate(docs_list):
+             print(f"\n🔹 Section {idx + 1}:\n{doc}\n" + "-" * 80)
+
+         return docs_list
+
+     except Exception as e:
+         print(f"\n❌ Error during Docling processing: {e}")
+         return []
+
+ ### 🔹 LangChain PDF Parsing
+ def parse_with_langchain(pdf_path):
+     """
+     Parses a PDF using LangChain's PyPDFLoader and prints the full extracted text.
+     """
+     try:
+         # Ensure the file exists
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError(f"File not found: {pdf_path}")
+
+         # Load the PDF using PyPDFLoader
+         loader = PyPDFLoader(pdf_path)
+         pages = loader.load()
+
+         # Extract text from all pages
+         text = "\n\n".join([page.page_content for page in pages])
+
+         # Print the full extracted content
+         print("\n✅ Full Extracted Content (LangChain):\n")
+         print(text)
+         print("\n" + "=" * 100)
+
+         return text
+
+     except Exception as e:
+         print(f"\n❌ Error during LangChain processing: {e}")
+         return ""
+
+ ### 🔹 Main Execution
+ def main():
+     ocr_path = "test/ocr_test.pdf"
+     scanned_image_path = "test/sample.png"
+
+     print("\n🔍 Running Docling Extraction for OCR...")
+     docling_docs = parse_with_docling(ocr_path)
+
+     print("\n🔍 Running LangChain Extraction for OCR...")
+     langchain_text = parse_with_langchain(ocr_path)
+
+     print("\n🔍 Running Docling Extraction for the scanned image...")
+     docling_docs = parse_with_docling(scanned_image_path)
+
+     print("\n🔍 Running LangChain Extraction for the scanned image...")
+     langchain_text = parse_with_langchain(scanned_image_path)
+
+ if __name__ == "__main__":
+     main()
utils/__init.py__ ADDED
@@ -0,0 +1,3 @@
+ from .logging import logger
+
+ __all__ = ["logger"]
utils/__pycache__/logging.cpython-310.pyc ADDED
Binary file (320 Bytes). View file
 
utils/logging.py ADDED
@@ -0,0 +1,8 @@
+ from loguru import logger
+
+ logger.add(
+     "app.log",
+     rotation="10 MB",
+     retention="30 days",
+     format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
+ )
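
Because `utils/__init.py__` re-exports this configured logger, every module can share the same rotating `app.log` sink. A small usage sketch (not part of this commit; the messages are hypothetical):

```python
# Sketch: using the shared loguru logger configured above.
from utils.logging import logger

logger.info("Document upload started")
try:
    raise ValueError("unreadable chunk")  # hypothetical failure
except ValueError:
    # logger.exception() logs at ERROR level and appends the traceback.
    logger.exception("Failed to process chunk")
```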
uv.lock ADDED
The diff for this file is too large to render. See raw diff