A7m0d committed
Commit 7dfe46c · verified · 1 parent: 2f3b05b

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .DS_Store +0 -0
  2. .gitignore +224 -0
  3. README.md +402 -8
  4. assets/bench_korean.csv +52 -0
  5. assets/out_gem_2_flash.txt +702 -0
  6. cache/cache_metadata.db +0 -0
  7. csv_evaluation.py +608 -0
  8. data/.DS_Store +0 -0
  9. data/.gitkeep +5 -0
  10. data/documents/.gitkeep +1 -0
  11. data/metadata.db +0 -0
  12. demo.py +198 -0
  13. direct_rag_loader.py +459 -0
  14. docker-compose.yml +15 -0
  15. embedding_cache/17/17af3460f99996bb3e89f78743fbaa60dba90b6d8cd6da492ced29aa7c4f8bc9.pkl +3 -0
  16. embedding_cache/cache_metadata.db +0 -0
  17. fixed_gradio_demo.py +457 -0
  18. launch_rag_demo.py +37 -0
  19. logger/custom_logger.py +191 -0
  20. logger/logging_config.yaml +30 -0
  21. quick_start.py +262 -0
  22. rag_launcher.py +604 -0
  23. requirements.txt +29 -0
  24. results/evaluation_results_20250912_115542.json +96 -0
  25. setup_local.py +590 -0
  26. src/__init__.py +23 -0
  27. src/clients.py +283 -0
  28. src/config.py +212 -0
  29. src/config.yaml +129 -0
  30. src/document_processor.py +365 -0
  31. src/embedding_system.py +312 -0
  32. src/evaluation_bench/dataset_loader.py +128 -0
  33. src/evaluation_bench/deep_eval.py +59 -0
  34. src/evaluation_bench/evaluator.py +275 -0
  35. src/evaluation_bench/gradio_demo.py +347 -0
  36. src/evaluation_bench/launch_gradio.py +35 -0
  37. src/evaluation_bench/launch_streamlit.py +35 -0
  38. src/evaluation_bench/logger.py +110 -0
  39. src/evaluation_bench/main.py +111 -0
  40. src/evaluation_bench/run_evaluation.py +37 -0
  41. src/evaluation_bench/streamlit_demo.py +424 -0
  42. src/evaluation_bench/visualization.py +347 -0
  43. src/evaluation_config.yaml +72 -0
  44. src/excel_processor.py +628 -0
  45. src/gradio_demo.py +676 -0
  46. src/groq_client.py +378 -0
  47. src/image_processor.py +538 -0
  48. src/ingestion_pipeline.py +505 -0
  49. src/metadata_manager.py +518 -0
  50. src/pdf_processor.py +495 -0
.DS_Store ADDED
Binary file (8.2 kB).
 
.gitignore ADDED
@@ -0,0 +1,224 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ *.excel
+ *.pdf
+ *.xlsx
+
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ # Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ # poetry.lock
+ # poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ # pdm.lock
+ # pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ # pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # Redis
+ *.rdb
+ *.aof
+ *.pid
+ .deepeval
+ .gradio
+ .kiro
+
+ # RabbitMQ
+ mnesia/
+ rabbitmq/
+ rabbitmq-data/
+
+ # ActiveMQ
+ activemq-data/
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
README.md CHANGED
@@ -1,12 +1,406 @@
  ---
- title: Rag Korean Manufacturing Docs
- emoji: 🌖
- colorFrom: indigo
- colorTo: red
  sdk: gradio
- sdk_version: 5.46.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: rag_korean_manufacturing_docs
+ app_file: fixed_gradio_demo.py
  sdk: gradio
+ sdk_version: 5.39.0
  ---
+ # 🏭 Manufacturing RAG Agent

+ A comprehensive Retrieval-Augmented Generation (RAG) system designed specifically for manufacturing document analysis. This system can process PDFs, Excel files with embedded images, and standalone images to provide accurate answers with complete citations and metadata tracking.
+
+ ## ✨ Features
+
+ ### 📄 Multi-Format Document Processing
+ - **PDF Documents**: Text extraction, table detection, and embedded image processing
+ - **Excel Files**: Worksheet data extraction, embedded image processing, and table detection
+ - **Images**: OCR text extraction with preprocessing for improved accuracy
+ - **Metadata Preservation**: Complete citation tracking with page numbers, worksheet names, and cell ranges
+
+ ### 🧠 Advanced RAG Capabilities
+ - **Semantic Search**: Vector-based similarity search using Qdrant
+ - **Reranking**: Improved relevance using Silicon Flow's Qwen3 reranker (see the sketch after this list)
+ - **Fast LLM Inference**: Sub-second response times using Groq's LPU architecture
+ - **Citation Generation**: Automatic source attribution with confidence scores
+
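A minimal, self-contained sketch of how these stages chain together: retrieve by vector similarity, rerank, and pass the surviving chunks with their sources into the LLM prompt. Toy stand-ins replace Qdrant, the Silicon Flow reranker, and Groq here so the control flow runs as-is; `Chunk`, `retrieve`, and `rerank` are illustrative names, not the project's actual classes:

```python
# Toy retrieve -> rerank -> generate flow; stand-ins replace the real services.
from __future__ import annotations
from dataclasses import dataclass
import math

@dataclass
class Chunk:
    text: str
    embedding: list[float]
    source: str  # citation target, e.g. "report.pdf p.3"

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    na, nb = math.sqrt(sum(x * x for x in a)), math.sqrt(sum(x * x for x in b))
    return dot / (na * nb) if na and nb else 0.0

def retrieve(query_emb, chunks, top_k=20, threshold=0.7):
    # Vector search stand-in: score every chunk, keep those above the threshold.
    scored = [(cosine(query_emb, c.embedding), c) for c in chunks]
    hits = [sc for sc in scored if sc[0] >= threshold]
    return sorted(hits, key=lambda sc: sc[0], reverse=True)[:top_k]

def rerank(query, hits, final_top_k=5):
    # Cross-encoder stand-in: boost chunks that share terms with the query.
    terms = set(query.lower().split())
    def score(hit):
        sim, chunk = hit
        return sim + 0.1 * len(terms & set(chunk.text.lower().split()))
    return sorted(hits, key=score, reverse=True)[:final_top_k]

chunks = [
    Chunk("Q3 average yield was 97%", [0.9, 0.1], "report.pdf p.3"),
    Chunk("Maintenance schedule for line 2", [0.2, 0.8], "plan.xlsx Sheet1"),
]
for sim, c in rerank("average yield", retrieve([0.95, 0.05], chunks)):
    print(f"{sim:.2f} {c.text} [{c.source}]")  # becomes LLM context + citation
```
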
+ ### 🔧 Production-Ready Features
+ - **Scalable Architecture**: Designed to handle up to 1TB of manufacturing data
+ - **Incremental Processing**: Efficient updates without reprocessing existing data
+ - **Comprehensive Monitoring**: Health checks, statistics, and performance metrics
+ - **Interactive Demo**: Streamlit-based web interface for easy testing
+
+ ## 🏗️ Architecture
+
+ ```mermaid
+ graph TB
+     subgraph "User Interface"
+         UI[Streamlit Demo]
+         API[REST API]
+     end
+
+     subgraph "RAG Engine"
+         QA[Question Answering]
+         RET[Document Retrieval]
+         RANK[Reranking]
+     end
+
+     subgraph "Processing Pipeline"
+         DOC[Document Processor]
+         EMB[Embedding Generator]
+         OCR[Image OCR]
+     end
+
+     subgraph "Storage Layer"
+         VDB[(Qdrant Vector DB)]
+         MDB[(SQLite Metadata)]
+         FS[(File Storage)]
+     end
+
+     subgraph "External APIs"
+         GROQ[Groq LLM API]
+         SF[Silicon Flow API]
+     end
+
+     UI --> QA
+     QA --> RET
+     RET --> RANK
+     RANK --> GROQ
+
+     DOC --> EMB
+     DOC --> OCR
+     EMB --> SF
+     OCR --> SF
+
+     EMB --> VDB
+     DOC --> MDB
+     DOC --> FS
+ ```
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+
+ 1. **Python 3.8+**
+ 2. **API Keys**:
+    - Groq API key for LLM inference
+    - Silicon Flow API key for embeddings and reranking
+    - Qdrant instance (local or cloud)
+
+ 3. **System Dependencies**:
+    - Tesseract OCR for image processing
+    - PyMuPDF for PDF processing
+
+ ### Installation
+
+ 1. **Clone the repository**:
+    ```bash
+    git clone <repository-url>
+    cd manufacturing-rag-agent
+    ```
+
+ 2. **Install dependencies**:
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. **Install system dependencies**:
+
+    **macOS**:
+    ```bash
+    brew install tesseract
+    ```
+
+    **Ubuntu/Debian**:
+    ```bash
+    sudo apt-get install tesseract-ocr
+    ```
+
+    **Windows**:
+    Download and install from [Tesseract GitHub](https://github.com/UB-Mannheim/tesseract/wiki)
+
+ 4. **Set up environment variables**:
+    ```bash
+    cp .env.example .env
+    # Edit .env with your API keys
+    ```
+
+ 5. **Configure Qdrant**:
+
+    **Local Qdrant (Docker)**:
+    ```bash
+    docker run -p 6333:6333 qdrant/qdrant
+    ```
+
+    **Or use Qdrant Cloud** and update the URL in `.env`
+
+ ### Configuration
+
+ Edit `src/config.yaml` to customize the system:
+
+ ```yaml
+ # RAG System Configuration
+ rag_system:
+   embedding_model: "qwen3-embedding"
+   reranker_model: "qwen3-reranker"
+   llm_model: "openai/gpt-oss-120b"
+   chunk_size: 512
+   chunk_overlap: 50
+   max_context_chunks: 5
+   similarity_threshold: 0.7
+
+ # Document Processing
+ document_processing:
+   pdf_engine: "pymupdf"
+   excel_engine: "openpyxl"
+   ocr_engine: "tesseract"
+   image_processing: true
+   table_extraction: true
+   max_file_size_mb: 100
+
+ # Storage Configuration
+ storage:
+   qdrant_collection: "manufacturing_docs"
+   metadata_db_path: "./data/metadata.db"
+   file_storage_path: "./data/documents"
+ ```
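For reference, the file can be inspected directly with PyYAML; a minimal sketch assuming the keys shown above (the project itself loads configuration through the `Config` class in `src/config.py`):

```python
# Read src/config.yaml and pull out a few RAG parameters (requires PyYAML).
import yaml

with open("src/config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["rag_system"]["chunk_size"])            # 512
print(cfg["rag_system"]["similarity_threshold"])  # 0.7
print(cfg["storage"]["qdrant_collection"])        # manufacturing_docs
```
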
+
+ ### Running the Demo
+
+ Launch the Streamlit demo:
+
+ ```bash
+ python launch_rag_demo.py
+ ```
+
+ Or run directly:
+
+ ```bash
+ streamlit run src/rag_demo.py
+ ```
+
+ The demo will be available at `http://localhost:8501`
+
+ ## 📖 Usage Guide
+
+ ### 1. Document Upload
+
+ 1. Navigate to the "📄 Document Upload" page
+ 2. Upload your manufacturing documents (PDF, Excel, or images)
+ 3. Click "Process Documents" to ingest them into the system
+ 4. Monitor processing progress and results
+
+ ### 2. Asking Questions
+
+ 1. Go to the "❓ Ask Questions" page
+ 2. Enter your question about the manufacturing data
+ 3. Optionally configure advanced settings:
+    - Number of context chunks
+    - Similarity threshold
+    - Document type filters
+ 4. View the answer with detailed citations
+
+ ### 3. Analytics
+
+ 1. Visit the "📊 Analytics" page to view:
+    - Document processing statistics
+    - Document type distribution
+    - Processing status overview
+    - Recent activity
+
+ ### 4. System Monitoring
+
+ 1. Check the "⚙️ System Status" page for:
+    - Component health checks
+    - Configuration details
+    - Performance metrics
+
+ ## 🔧 API Usage
+
+ ### Document Ingestion
+
+ ```python
+ from src.rag.ingestion_pipeline import DocumentIngestionPipeline
+
+ # Initialize pipeline
+ config = {...}  # Your configuration
+ pipeline = DocumentIngestionPipeline(config)
+
+ # Ingest single document
+ result = pipeline.ingest_document("path/to/document.pdf")
+
+ # Batch ingestion
+ results = pipeline.ingest_batch([
+     "path/to/doc1.pdf",
+     "path/to/doc2.xlsx",
+     "path/to/image.png"
+ ])
+ ```
+
+ ### Question Answering
+
+ ```python
+ from src.rag.rag_engine import RAGEngine
+
+ # Initialize RAG engine
+ rag_engine = RAGEngine(config)
+
+ # Ask a question
+ response = rag_engine.answer_question(
+     "What is the average production yield for Q3?"
+ )
+
+ print(f"Answer: {response.answer}")
+ print(f"Confidence: {response.confidence_score}")
+ print(f"Sources: {len(response.citations)}")
+
+ # View citations
+ for citation in response.citations:
+     print(f"Source: {citation.source_file}")
+     if citation.page_number:
+         print(f"Page: {citation.page_number}")
+     if citation.worksheet_name:
+         print(f"Sheet: {citation.worksheet_name}")
+ ```
+
+ ## 🧪 Testing
+
+ Run the test suite:
+
+ ```bash
+ # Run all tests
+ pytest
+
+ # Run specific test modules
+ pytest src/tests/test_document_processor.py
+ pytest src/tests/test_rag_system.py
+
+ # Run with coverage
+ pytest --cov=src --cov-report=html
+ ```
+
+ ## 📊 Performance
+
+ ### Benchmarks
+
+ - **Document Processing**:
+   - PDF: ~2-5 seconds per page
+   - Excel: ~1-3 seconds per worksheet
+   - Images: ~1-2 seconds per image (with OCR)
+
+ - **Query Response Time**:
+   - Vector Search: ~100-300ms
+   - Reranking: ~200-500ms
+   - LLM Generation: ~500-1500ms
+   - **Total**: ~1-3 seconds per query
+
+ - **Scalability**:
+   - Tested with up to 10,000 documents
+   - Supports concurrent processing
+   - Memory-efficient chunking strategy
+
+ ### Optimization Tips
+
+ 1. **Batch Processing**: Process multiple documents together for better throughput
+ 2. **Chunk Size**: Adjust chunk size based on your document types (see the sketch below)
+ 3. **Embedding Cache**: Enable caching for repeated content
+ 4. **Qdrant Optimization**: Use appropriate vector size and distance metrics
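To make the `chunk_size`/`chunk_overlap` settings concrete, here is a minimal character-based sliding-window chunker; the project's actual splitter may differ (for example, by splitting on tokens), so treat this as an illustration of how the overlap preserves context across chunk boundaries:

```python
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list:
    """Split text into overlapping windows; the overlap keeps boundary context."""
    if chunk_size <= overlap:
        raise ValueError("chunk_size must exceed overlap")
    step = chunk_size - overlap  # each window starts this far after the last
    return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]

doc = "yield report " * 200            # ~2600 characters of toy content
chunks = chunk_text(doc)
print(len(chunks), len(chunks[0]))      # number of chunks, first chunk length
```
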
+
+ ## 🔒 Security Considerations
+
+ - **API Keys**: Store securely in environment variables
+ - **File Validation**: Automatic file type and size validation
+ - **Input Sanitization**: All user inputs are sanitized
+ - **Access Control**: Implement authentication for production use
+ - **Data Privacy**: Consider data residency requirements for cloud APIs
+
+ ## 🛠️ Troubleshooting
+
+ ### Common Issues
+
+ 1. **Tesseract Not Found**:
+    ```bash
+    # Install Tesseract OCR
+    brew install tesseract  # macOS
+    sudo apt-get install tesseract-ocr  # Ubuntu
+    ```
+
+ 2. **Qdrant Connection Failed**:
+    - Check if Qdrant is running: `curl http://localhost:6333/health`
+    - Verify URL and API key in `.env`
+
+ 3. **API Rate Limits**:
+    - Check your API quotas
+    - Implement exponential backoff (already included; see the sketch after this list)
+
+ 4. **Memory Issues**:
+    - Reduce batch size in configuration
+    - Process documents individually for large files
+
+ 5. **Slow Performance**:
+    - Check network connectivity to APIs
+    - Monitor Qdrant performance
+    - Consider local embedding models for high-volume use
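The bundled API clients already retry; for reference, a minimal sketch of the exponential-backoff-with-jitter pattern (`call_api` in the usage comment is a placeholder, not a project function):

```python
import random
import time

def with_backoff(fn, max_retries: int = 3, base_delay: float = 1.0):
    """Retry fn() on failure, doubling the wait each attempt and adding jitter."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise  # give up after the final attempt
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.5))

# Usage: with_backoff(lambda: call_api(payload))  # call_api is a placeholder
```
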
+
+ ### Debug Mode
+
+ Enable debug logging:
+
+ ```python
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+ ```
+
+ Or set an environment variable:
+ ```bash
+ export DEBUG=true
+ ```
+
+ ## 🤝 Contributing
+
+ 1. Fork the repository
+ 2. Create a feature branch: `git checkout -b feature-name`
+ 3. Make your changes and add tests
+ 4. Run the test suite: `pytest`
+ 5. Submit a pull request
+
+ ### Development Setup
+
+ ```bash
+ # Install development dependencies
+ pip install -r requirements-dev.txt
+
+ # Run pre-commit hooks
+ pre-commit install
+
+ # Run linting
+ flake8 src/
+ black src/
+
+ # Run type checking
+ mypy src/
+ ```
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## 🙏 Acknowledgments
+
+ - **Groq** for fast LLM inference
+ - **Silicon Flow** for embedding and reranking APIs
+ - **Qdrant** for vector database capabilities
+ - **Streamlit** for the interactive demo interface
+ - **PyMuPDF** for PDF processing
+ - **OpenPyXL** for Excel file handling
+ - **Tesseract** for OCR capabilities
+
+ ## 📞 Support
+
+ For questions, issues, or feature requests:
+
+ 1. Check the [Issues](../../issues) page
+ 2. Review the [Troubleshooting](#-troubleshooting) section
+ 3. Create a new issue with detailed information
+
+ ---
+
+ **Built with ❤️ for manufacturing excellence**
assets/bench_korean.csv ADDED
@@ -0,0 +1,52 @@
+ ,input,expected_output
+ ,"""이번 달 우리 회사 전체 매출은 얼마야?""","A: 2025년 1월 삼광 Global 전체 매출은 335.4억원입니다. 이는 당초 사업계획(213.4억원) 대비 57% 초과 달성한 수치이며, 실행계획(307.8억원) 대비도 109% 달성한 성과입니다."
+ ,"""사업부별 매출 비중이 어떻게 되나요?""","A: 2025년 1월 기준 사업부별 매출 비중은 다음과 같습니다:
+
+ 한국 사업부: 213.0억원 (39.7%)
+ 베트남 사업부: 38.6억원 (44.1%)
+ 인도 사업부: 미미한 수준
+ 윈테크: 미미한 수준 한국과 베트남 사업부가 전체 매출의 약 84%를 차지하고 있습니다.
+ "
+ ,"""우리 회사 영업이익률은 몇 %야?""","A: 2025년 1월 전사 영업이익률은 3%입니다. 영업이익은 8.97억원이며, 사업부별로는 한국 4%, 베트남 2%, 윈테크는 -7%의 영업이익률을 기록했습니다.
+
+ 생산/품질 관련 질문"
+ ,"""TAB S10 도장 공정 수율이 어떻게 되나요?""","A: TAB S10 제품의 도장 공정 수율은 평균 98%로 매우 양호합니다. 세부적으로 TAB S10 REAR BODY 도장은 98%, TAB S10 KNOB 도장은 99%의 수율을 보이고 있습니다."
+ ,"""최근 수율이 낮은 공정이 있나요?""","A: 네, 몇 가지 주의가 필요한 공정이 있습니다:
+
+ R47 ENCLOSURE, LOWER, BATTERY, LARGE 사출: 59%
+ R47 ARM, FRONT RIGHT, UPPER 사출: 80%
+ Tab S10 FE FRONT BODY 사출: 87% 이 공정들은 90% 미만의 수율로 개선이 필요합니다."
+ ,"""삼성 폴더블폰 부품(SM-F 시리즈) 생산 현황은?""","A: 삼성 폴더블폰 부품 생산이 활발합니다:
+
+ SM-F721U: FRONT DECO MAIN/SUB NC 공정 수율 96-97%
+ SM-F731U: NC 공정 수율 97%, 조립 수율 100%
+ SM-F741U: NC 공정 수율 95%, 레이저 공정 수율 99%
+ SM-F936U: NC 및 조립 공정 모두 100% 수율 달성
+ 재고 관련 질문"
+ ,"""R47 시리즈 재고 현황이 어떻게 되나요?""","A: R47 시리즈 주요 품목 재고 현황:
+
+ R47 ENCLOSURE, LOWER, BATTERY, LARGE 가공품: 568 EA (양품)
+ 기타 R47 부품들은 현재 재고가 없는 상태입니다.
+ 대부분 게이트 커팅 가공이나 사출 공정을 거치는 부품들입니다."
+ ,"""C18 제품군 재고가 있나요?""","A: C18 제품군은 모두 재고가 0인 상태입니다. CLAMSHELL COVER, ENCLOSURE 등 주요 부품들이 재고 소진 상태이므로 생산 계획 수립이 필요합니다.
+
+ 원가 관련 질문"
+ ,"""우리 회사 매출원가율이 높은 이유가 뭐야?""","A: 2025년 1월 전사 매출원가율은 92%로 매우 높습니다. 주요 원인은:
+
+ 매입비(원부자재+외주가공비): 67% - 가장 큰 비중
+ 노무비: 12%
+ 제조경비: 11% 특히 베트남 사업부(94%)와 인도 사업부(92%)의 매출원가율이 높아 수익성 개선이 시급합니다."
+ ,"""실패비용이 얼마나 발생했나요?""","A: 2025년 1월 전사 실패비용은 5.16억원(매출 대비 2%)입니다. 사업부별로는:
+
+ 한국: 0.23억원 (1%)
+ 베트남: 3.95억원 (2%) - 가장 높음
+ 인도: 0.48억원 (1%)
+ 윈테크: 0.50억원 (1%) 베트남 사업부의 실패비용 절감이 필요합니다.
+ 작업 관련 질문"
+ ,"""SMF741UB6 조립 작업 시 주의사항이 뭐야?""",A: SMF741UB6 FRONT DECO SUB 조립 작업표준서에 따른 주요 주의사항을 확인해야 합니다. 2024년 7월 8일에 조립 부분이 수정된 최신 버전을 참고하시기 바랍니다.
+ ,"""이번 달 생산성이 가장 좋은 공정은?""","A: 다음 공정들이 100% 수율을 달성했습니다:
+
+ SM-F936U NC 및 조립 공정
+ C18 SHIM 가공 및 사출
+ PA3 DECO 아노다이징, 샌딩, 버핑
+ 대부분의 조립(ASS'Y) 공정 이들 공정은 벤치마킹 대상으로 삼을 수 있습니다."
@@ -0,0 +1,702 @@
 
+ ```json
+ ✨ You're running DeepEval's latest Answer Relevancy Metric! (using gemini-2.0-flash, strict=False,
+ async_mode=True)...
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "2025년 1월 삼광 Global 전체 매출은 335.4억원입니다.",
+   "이는 당초 사업계획(213.4억원) 대비 57% 초과 달성한 수치입니다.",
+   "실행계획(307.8억원) 대비도 109% 달성한 성과입니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The score is 1.00 because there were no irrelevant statements, great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "2025년 1월 기준 사업부별 매출 비중",
+   "한국 사업부: 213.0억원 (39.7%)",
+   "베트남 사업부: 38.6억원 (44.1%)",
+   "인도 사업부: 미미한 수준",
+   "윈테크: 미미한 수준",
+   "한국과 베트남 사업부가 전체 매출의 약 84%를 차지하고 있습니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "네, 몇 가지 주의가 필요한 공정이 있습니다",
+   "R47 ENCLOSURE, LOWER, BATTERY, LARGE 사출: 59%",
+   "R47 ARM, FRONT RIGHT, UPPER 사출: 80%",
+   "Tab S10 FE FRONT BODY 사출: 87%",
+   "이 공정들은 90% 미만의 수율로 개선이 필요합니다"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "2025년 1월 전사 매출원가율은 92%로 매우 높습니다.",
+   "매입비(원부자재+외주가공비): 67% - 가장 큰 비중",
+   "노무비: 12%",
+   "제조경비: 11%",
+   "베트남 사업부(94%)와 인도 사업부(92%)의 매출원가율이 높아 수익성 개선이 시급합니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The answer relevancy score is perfect at 1.00! Great job maintaining focus and relevance.
+
+ ======================================================================
+ **************************************************
+ **************************************************
+
+ Statements:
+ [
+   "SMF741UB6 FRONT DECO SUB 조립 작업표준서에 따른 주요 주의사항을 확인해야 합니다.",
+   "2024년 7월 8일에 조립 부분이 수정된 최신 버전을 참고하시기 바랍니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The score is 1.00 because there were no irrelevant statements, indicating perfect alignment with the
+ user's query! Great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "TAB S10 제품의 도장 공정 수율은 평균 98%로 매우 양호합니다.",
+   "TAB S10 REAR BODY 도장은 98%의 수율을 보이고 있습니다.",
+   "TAB S10 KNOB 도장은 99%의 수율을 보이고 있습니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "2025년 1월 전사 영업이익률은 3%입니다.",
+   "영업이익은 8.97억원입니다.",
+   "사업부별로는 한국 4%입니다.",
+   "베트남 2%입니다.",
+   "윈테크는 -7%의 영업이익률을 기록했습니다.",
+   "생산/품질 관련 질문"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "no",
+     "reason": "This statement is about the topic of the question, not the \uc601\uc5c5\uc774\uc775\ub960."
+   }
+ ]
+
+ Score: 0.8333333333333334
+ Reason: The score is 0.83 because the response contains a statement about the topic of the question, rather
+ than directly addressing the 영업이익률, making it partially irrelevant.
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "2025년 1월 전사 실패비용은 5.16억원(매출 대비 2%)입니다.",
+   "한국: 0.23억원 (1%)",
+   "베트남: 3.95억원 (2%) - 가장 높음",
+   "인도: 0.48억원 (1%)",
+   "윈테크: 0.50억원 (1%)",
+   "베트남 사업부의 실패비용 절감이 필요합니다.",
+   "작업 관련 질문"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "no",
+     "reason": "This statement is a question about work, not about the cost of failure."
+   }
+ ]
+
+ Score: 0.8571428571428571
+ Reason: The score is 0.86 because there was a question about work that was irrelevant to the cost of failure,
+ but overall the response was still pretty relevant!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "다음 공정들이 100% 수율을 달성했습니다",
+   "SM-F936U NC 및 조립 공정",
+   "C18 SHIM 가공 및 사출",
+   "PA3 DECO 아노다이징, 샌딩, 버핑",
+   "대부분의 조립(ASS'Y) 공정",
+   "이들 공정은 벤치마킹 대상으로 삼을 수 있습니다"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "idk",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "idk",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The answer relevancy score is perfect! Great job!
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "C18 제품군은 모두 재고가 0인 상태입니다.",
+   "CLAMSHELL COVER, ENCLOSURE 등 주요 부품들이 재고 소진 상태이므로 생산 계획 수립이 필요합니다.",
+   "원가 관련 질문"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "idk",
+     "reason": null
+   },
+   {
+     "verdict": "no",
+     "reason": "The statement is about cost, not about the availability of C18 products."
+   }
+ ]
+
+ Score: 0.6666666666666666
+ Reason: The score is 0.67 because the response contains information about cost, which is not directly related
+ to the question about the availability of C18 products. However, it still addresses the general topic of C18
+ products, hence the non-zero score.
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "삼성 폴더블폰 부품 생산이 활발합니다",
+   "SM-F721U: FRONT DECO MAIN/SUB NC 공정 수율 96-97%",
+   "SM-F731U: NC 공정 수율 97%, 조립 수율 100%",
+   "SM-F741U: NC 공정 수율 95%, 레이저 공정 수율 99%",
+   "SM-F936U: NC 및 조립 공정 모두 100% 수율 달성",
+   "재고 관련 질문"
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "no",
+     "reason": "The statement is about inventory, which is not directly related to the production status of
+     Samsung foldable phone parts."
+   }
+ ]
+
+ Score: 0.8333333333333334
+ Reason: The score is 0.83 because the output contains information about inventory, which is not directly
+ related to the production status of Samsung foldable phone parts, but it still provides some relevant
+ information.
+
+ ======================================================================
+ **************************************************
+ Answer Relevancy Verbose Logs
+ **************************************************
+
+ Statements:
+ [
+   "R47 시리즈 주요 품목 재고 현황:",
+   "R47 ENCLOSURE, LOWER, BATTERY, LARGE 가공품: 568 EA (양품)",
+   "기타 R47 부품들은 현재 재고가 없는 상태입니다.",
+   "대부분 게이트 커팅 가공이나 사출 공정을 거치는 부품들입니다."
+ ]
+
+ Verdicts:
+ [
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "yes",
+     "reason": null
+   },
+   {
+     "verdict": "idk",
+     "reason": null
+   }
+ ]
+
+ Score: 1.0
+ Reason: The answer relevancy score is 1.00 because there were no irrelevant statements, indicating perfect
+ alignment with the input! Great job!
+
+ ======================================================================
+
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 1.00 because there were no irrelevant statements, great job!, error: None)
+
+ For test case:
+
+ - input: "이번 달 우리 회사 전체 매출은 얼마야?"
+ - actual output: A: 2025년 1월 삼광 Global 전체 매출은 335.4억원입니다. 이는 당초 사업계획(213.4억원) 대비 57% 초과 달성한 수치이며, 실행계획(307.8억원) 대비도 109% 달성한 성과입니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!, error: None)
+
+ For test case:
+
+ - input: "사업부별 매출 비중이 어떻게 되나요?"
+ - actual output: A: 2025년 1월 기준 사업부별 매출 비중은 다음과 같습니다:
+
+ 한국 사업부: 213.0억원 (39.7%)
+ 베트남 사업부: 38.6억원 (44.1%)
+ 인도 사업부: 미미한 수준
+ 윈테크: 미미한 수준 한국과 베트남 사업부가 전체 매출의 약 84%를 차지하고 있습니다.
+
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!, error: None)
+
+ For test case:
+
+ - input: "최근 수율이 낮은 공정이 있나요?"
+ - actual output: A: 네, 몇 가지 주의가 필요한 공정이 있습니다:
+
+ R47 ENCLOSURE, LOWER, BATTERY, LARGE 사출: 59%
+ R47 ARM, FRONT RIGHT, UPPER 사출: 80%
+ Tab S10 FE FRONT BODY 사출: 87% 이 공정들은 90% 미만의 수율로 개선이 필요합니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The answer relevancy score is perfect at 1.00! Great job maintaining focus and relevance., error: None)
+
+ For test case:
+
+ - input: "우리 회사 매출원가율이 높은 이유가 뭐야?"
+ - actual output: A: 2025년 1월 전사 매출원가율은 92%로 매우 높습니다. 주요 원인은:
+
+ 매입비(원부자재+외주가공비): 67% - 가장 큰 비중
+ 노무비: 12%
+ 제조경비: 11% 특히 베트남 사업부(94%)와 인도 사업부(92%)의 매출원가율이 높아 수익성 개선이 시급합니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 1.00 because there were no irrelevant statements, indicating perfect alignment with the user's query! Great job!, error: None)
+
+ For test case:
+
+ - input: "SMF741UB6 조립 작업 시 주의사항이 뭐야?"
+ - actual output: A: SMF741UB6 FRONT DECO SUB 조립 작업표준서에 따른 주요 주의사항을 확인해야 합니다. 2024년 7월 8일에 조립 부분이 수정된 최신 버전을 참고하시기 바랍니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 1.00 because there were no irrelevant statements, indicating perfect relevancy! Great job!, error: None)
+
+ For test case:
+
+ - input: "TAB S10 도장 공정 수율이 어떻게 되나요?"
+ - actual output: A: TAB S10 제품의 도장 공정 수율은 평균 98%로 매우 양호합니다. 세부적으로 TAB S10 REAR BODY 도장은 98%, TAB S10 KNOB 도장은 99%의 수율을 보이고 있습니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 0.8333333333333334, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.83 because the response contains a statement about the topic of the question, rather than directly addressing the 영업이익률, making it partially irrelevant., error: None)
+
+ For test case:
+
+ - input: "우리 회사 영업이익률은 몇 %야?"
+ - actual output: A: 2025년 1월 전사 영업이익률은 3%입니다. 영업이익은 8.97억원이며, 사업부별로는 한국 4%, 베트남 2%, 윈테크는 -7%의 영업이익률을 기록했습니다.
+
+ 생산/품질 관련 질문
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 0.8571428571428571, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.86 because there was a question about work that was irrelevant to the cost of failure, but overall the response was still pretty relevant!, error: None)
+
+ For test case:
+
+ - input: "실패비용이 얼마나 발생했나요?"
+ - actual output: A: 2025년 1월 전사 실패비용은 5.16억원(매출 대비 2%)입니다. 사업부별로는:
+
+ 한국: 0.23억원 (1%)
+ 베트남: 3.95억원 (2%) - 가장 높음
+ 인도: 0.48억원 (1%)
+ 윈테크: 0.50억원 (1%) 베트남 사업부의 실패비용 절감이 필요합니다.
+ 작업 관련 질문
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The answer relevancy score is perfect! Great job!, error: None)
+
+ For test case:
+
+ - input: "이번 달 생산성이 가장 좋은 공정은?"
+ - actual output: A: 다음 공정들이 100% 수율을 달성했습니다:
+
+ SM-F936U NC 및 조립 공정
+ C18 SHIM 가공 및 사출
+ PA3 DECO 아노다이징, 샌딩, 버핑
+ 대부분의 조립(ASS'Y) 공정 이들 공정은 벤치마킹 대상으로 삼을 수 있습니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ❌ Answer Relevancy (score: 0.6666666666666666, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.67 because the response contains information about cost, which is not directly related to the question about the availability of C18 products. However, it still addresses the general topic of C18 products, hence the non-zero score., error: None)
+
+ For test case:
+
+ - input: "C18 제품군 재고가 있나요?"
+ - actual output: A: C18 제품군은 모두 재고가 0인 상태입니다. CLAMSHELL COVER, ENCLOSURE 등 주요 부품들이 재고 소진 상태이므로 생산 계획 수립이 필요합니다.
+
+ 원가 관련 질문
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 0.8333333333333334, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.83 because the output contains information about inventory, which is not directly related to the production status of Samsung foldable phone parts, but it still provides some relevant information., error: None)
+
+ For test case:
+
+ - input: "삼성 폴더블폰 부품(SM-F 시리즈) 생산 현황은?"
+ - actual output: A: 삼성 폴더블폰 부품 생산이 활발합니다:
+
+ SM-F721U: FRONT DECO MAIN/SUB NC 공정 수율 96-97%
+ SM-F731U: NC 공정 수율 97%, 조립 수율 100%
+ SM-F741U: NC 공정 수율 95%, 레이저 공정 수율 99%
+ SM-F936U: NC 및 조립 공정 모두 100% 수율 달성
+ 재고 관련 질문
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Metrics Summary
+
+ - ✅ Answer Relevancy (score: 1.0, threshold: 0.8, strict: False, evaluation model: gemini-2.0-flash, reason: The answer relevancy score is 1.00 because there were no irrelevant statements, indicating perfect alignment with the input! Great job!, error: None)
+
+ For test case:
+
+ - input: "R47 시리즈 재고 현황이 어떻게 되나요?"
+ - actual output: A: R47 시리즈 주요 품목 재고 현황:
+
+ R47 ENCLOSURE, LOWER, BATTERY, LARGE 가공품: 568 EA (양품)
+ 기타 R47 부품들은 현재 재고가 없는 상태입니다.
+ 대부분 게이트 커팅 가공이나 사출 공정을 거치는 부품들입니다.
+ - expected output: None
+ - context: []
+ - retrieval context: []
+
+ ======================================================================
+
+ Overall Metric Pass Rates
+
+ Answer Relevancy: 91.67% pass rate
+
+ ======================================================================
+
+
+ ```
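
The log above comes from DeepEval's Answer Relevancy metric (threshold 0.8, judged by gemini-2.0-flash; 11 of 12 test cases pass, hence the 91.67% pass rate). For orientation, a minimal sketch of how such a run is typically set up with DeepEval; the judge model is configured separately, and the repository's own runner lives under `src/evaluation_bench/`:

```python
# Score one benchmark Q/A pair for answer relevancy, mirroring the log's
# threshold of 0.8. Sketch only; not the repository's exact evaluation script.
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

metric = AnswerRelevancyMetric(threshold=0.8, verbose_mode=True)
case = LLMTestCase(
    input="TAB S10 도장 공정 수율이 어떻게 되나요?",
    actual_output="A: TAB S10 제품의 도장 공정 수율은 평균 98%로 매우 양호합니다.",
)
evaluate(test_cases=[case], metrics=[metric])
```
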
cache/cache_metadata.db ADDED
Binary file (20.5 kB).
 
csv_evaluation.py ADDED
@@ -0,0 +1,608 @@
+ #!/usr/bin/env python3
+ """
+ Complete CSV Question Evaluation Script for Manufacturing RAG Agent
+ """
+
+ import pandas as pd
+ import argparse
+ import logging
+ import os
+ import sys
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+ from datetime import datetime
+ import time
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Add parent directory to path for imports
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler('csv_evaluation.log'),
+         logging.StreamHandler()
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ try:
+     from src.config import Config
+     from src.rag_engine import RAGEngine
+     from src.document_processor import DocumentProcessorFactory, DocumentType
+     from src.pdf_processor import PDFProcessor
+     from src.excel_processor import ExcelProcessor
+     from src.image_processor import ImageProcessor
+
+ except ImportError as e:
+     logger.error(f"Failed to import RAG components: {e}")
+     print(f"❌ Import Error: {e}")
+     print("Please ensure all src/ modules are properly structured and dependencies are installed")
+     sys.exit(1)
+
+
+ class CSVEvaluator:
+     """CSV-based question evaluation system."""
+
+     def __init__(self, config_path: str = "src/config.yaml"):
+         """Initialize the CSV evaluator."""
+         self.config_path = config_path
+         self.rag_engine = None
+         self.system_initialized = False
+
+     def initialize_system(self) -> bool:
+         """Initialize the RAG system."""
+         try:
+             logger.info("Initializing RAG system...")
+
+             # Load configuration
+             if not os.path.exists(self.config_path):
+                 logger.error(f"Configuration file not found: {self.config_path}")
+                 return False
+
+             config = Config(self.config_path)
+
+             # Validate required API keys
+             required_keys = {
+                 'GROQ_API_KEY': config.groq_api_key,
+                 'SILICONFLOW_API_KEY': config.siliconflow_api_key,
+                 'QDRANT_URL': config.qdrant_url
+             }
+
+             missing_keys = [k for k, v in required_keys.items() if not v]
+             if missing_keys:
+                 logger.error(f"Missing required environment variables: {', '.join(missing_keys)}")
+                 return False
+
+             # Create configuration dictionary
+             rag_config = config.rag_config
+             config_dict = {
+                 # API configuration
+                 'siliconflow_api_key': config.siliconflow_api_key,
+                 'groq_api_key': config.groq_api_key,
+                 'qdrant_url': config.qdrant_url,
+                 'qdrant_api_key': config.qdrant_api_key,
+                 'qdrant_collection': 'manufacturing_docs',
+
+                 # Model configuration
+                 'embedding_model': rag_config.get('embedding_model', 'Qwen/Qwen3-Embedding-8B'),
+                 'reranker_model': rag_config.get('reranker_model', 'Qwen/Qwen3-Reranker-8B'),
+                 'llm_model': rag_config.get('llm_model', 'openai/gpt-oss-120b'),
+
+                 # RAG parameters
+                 'max_context_chunks': rag_config.get('max_context_chunks', 5),
+                 'similarity_threshold': rag_config.get('similarity_threshold', 0.7),
+                 'rerank_top_k': rag_config.get('rerank_top_k', 20),
+                 'final_top_k': rag_config.get('final_top_k', 5),
+                 'max_context_length': 4000,
+                 'vector_size': 1024,
+
+                 # Performance settings
+                 'max_retries': 3,
+                 'temperature': rag_config.get('temperature', 0.1),
+                 'max_tokens': rag_config.get('max_tokens', 1024)
+             }
+
+             # Register document processors
+             DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
+             DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)
+             DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
+
+             # Initialize RAG engine
+             self.rag_engine = RAGEngine(config_dict)
+
+             # Verify system health
+             health = self.rag_engine.health_check()
+             if not health.get('vector_store', False):
+                 logger.warning("Vector store health check failed - this might affect performance")
+
+             if not health.get('llm_system', False):
+                 logger.error("LLM system health check failed")
+                 return False
+
+             self.system_initialized = True
+             logger.info("✅ RAG system initialized successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to initialize system: {e}")
+             return False
+
+     def load_questions_csv(self, csv_path: str, question_column: str = "question") -> pd.DataFrame:
+         """Load questions from CSV file."""
+         try:
+             if not os.path.exists(csv_path):
+                 raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+             df = pd.read_csv(csv_path)
+             logger.info(f"Loaded {len(df)} questions from {csv_path}")
+
+             if question_column not in df.columns:
+                 raise ValueError(f"Question column '{question_column}' not found in CSV. Available columns: {df.columns.tolist()}")
+
+             # Remove empty questions
+             original_count = len(df)
+             df = df[df[question_column].notna() & (df[question_column].str.strip() != "")]
+             final_count = len(df)
+
+             if original_count != final_count:
+                 logger.info(f"Filtered out {original_count - final_count} empty questions")
+
+             return df
+
+         except Exception as e:
+             logger.error(f"Failed to load questions CSV: {e}")
+             raise
+
+     def evaluate_questions(self, questions_df: pd.DataFrame, question_column: str = "question",
+                            batch_size: int = 10, delay_between_batches: float = 1.0) -> pd.DataFrame:
+         """Evaluate questions and return results DataFrame."""
+         if not self.system_initialized:
+             raise RuntimeError("System not initialized. Call initialize_system() first.")
+
+         results = []
+         total_questions = len(questions_df)
+
+         logger.info(f"Starting evaluation of {total_questions} questions...")
+
+         # Process questions in batches to avoid overwhelming the API
+         for batch_start in range(0, total_questions, batch_size):
+             batch_end = min(batch_start + batch_size, total_questions)
+             batch_df = questions_df.iloc[batch_start:batch_end]
+
+             logger.info(f"Processing batch {batch_start//batch_size + 1}/{(total_questions-1)//batch_size + 1} "
+                         f"(questions {batch_start+1}-{batch_end})")
+
+             # Process each question in the batch
+             for idx, row in batch_df.iterrows():
+                 question = row[question_column]
+
+                 try:
+                     logger.info(f"Processing question {idx+1}: {question[:50]}...")
+
+                     # Get answer from RAG system
+                     start_time = time.time()
+                     response = self.rag_engine.answer_question(question)
+                     processing_time = time.time() - start_time
+
+                     # Extract result information
+                     result = {
+                         'question_id': idx,
+                         'question': question,
+                         'answer': response.answer if response.success else "Error: Could not generate answer",
+                         'success': response.success,
+                         'confidence_score': response.confidence_score if response.success else 0.0,
+                         'processing_time': processing_time,
+                         'retrieval_time': response.retrieval_time if response.success else 0.0,
+                         'generation_time': response.generation_time if response.success else 0.0,
+                         'sources_count': len(response.citations) if response.success else 0,
+                         'chunks_retrieved': response.total_chunks_retrieved if response.success else 0,
+                         'model_used': response.model_used if response.success else "N/A",
+                         'error_message': response.error_message if not response.success else "",
+                         'timestamp': datetime.now().isoformat()
+                     }
+
+                     # Add citations information
+                     if response.success and response.citations:
+                         citations_info = []
+                         for i, citation in enumerate(response.citations):
+                             citation_text = f"Source {i+1}: {citation.source_file}"
+                             if citation.page_number:
+                                 citation_text += f" (Page {citation.page_number})"
+                             if citation.worksheet_name:
+                                 citation_text += f" (Sheet: {citation.worksheet_name})"
+                             citations_info.append(citation_text)
+
+                         result['citations'] = " | ".join(citations_info)
+                         result['top_citation_confidence'] = max([c.confidence for c in response.citations])
+                     else:
+                         result['citations'] = ""
+                         result['top_citation_confidence'] = 0.0
+
+                     # Copy additional columns from original CSV
+                     for col in row.index:
+                         if col != question_column and col not in result:
+                             result[col] = row[col]
+
+                     results.append(result)
+
+                     # Log success
+                     if response.success:
+                         logger.info(f"✅ Question {idx+1} processed successfully "
+                                     f"(confidence: {response.confidence_score:.2f}, "
+                                     f"time: {processing_time:.2f}s)")
+                     else:
+                         logger.warning(f"⚠️ Question {idx+1} failed: {response.error_message}")
+
+                 except Exception as e:
+                     logger.error(f"❌ Error processing question {idx+1}: {e}")
+
+                     # Add error result
+                     error_result = {
+                         'question_id': idx,
+                         'question': question,
+                         'answer': f"Error: {str(e)}",
+                         'success': False,
+                         'confidence_score': 0.0,
+                         'processing_time': 0.0,
+                         'retrieval_time': 0.0,
+                         'generation_time': 0.0,
+                         'sources_count': 0,
+                         'chunks_retrieved': 0,
+                         'model_used': "N/A",
+                         'error_message': str(e),
+                         'citations': "",
+                         'top_citation_confidence': 0.0,
+                         'timestamp': datetime.now().isoformat()
+                     }
+
+                     # Copy additional columns
+                     for col in row.index:
+                         if col != question_column and col not in error_result:
+                             error_result[col] = row[col]
+
+                     results.append(error_result)
+
+                 # Small delay between questions
+                 time.sleep(0.5)
+
+             # Delay between batches
+             if batch_end < total_questions:
+                 logger.info(f"Waiting {delay_between_batches}s before next batch...")
+                 time.sleep(delay_between_batches)
+
+         logger.info(f"Completed evaluation of {len(results)} questions")
+         return pd.DataFrame(results)
+
+     def save_results(self, results_df: pd.DataFrame, output_path: str,
+                      include_summary: bool = True) -> str:
+         """Save results to CSV file and optionally create summary."""
+         try:
+             # Ensure output directory exists
+             output_file = Path(output_path)
+             output_file.parent.mkdir(parents=True, exist_ok=True)
+
+             # Save main results
+             results_df.to_csv(output_path, index=False)
+             logger.info(f"Results saved to {output_path}")
+
+             # Create summary if requested
+             if include_summary:
+                 summary_path = output_file.with_suffix('.summary.txt')
+                 summary = self._generate_summary(results_df)
+
+                 with open(summary_path, 'w', encoding='utf-8') as f:
+                     f.write(summary)
+
+                 logger.info(f"Summary saved to {summary_path}")
+                 return str(summary_path)
+
+             return output_path
+
+         except Exception as e:
+             logger.error(f"Failed to save results: {e}")
+             raise
+
+     def _generate_summary(self, results_df: pd.DataFrame) -> str:
+         """Generate evaluation summary."""
+         total_questions = len(results_df)
+         successful_questions = len(results_df[results_df['success'] == True])
+         failed_questions = total_questions - successful_questions
+
+         success_rate = (successful_questions / total_questions * 100) if total_questions > 0 else 0
+
+         # Calculate statistics for successful questions
+         successful_df = results_df[results_df['success'] == True]
+
+         if len(successful_df) > 0:
+             avg_confidence = successful_df['confidence_score'].mean()
+             avg_processing_time = successful_df['processing_time'].mean()
+             avg_sources = successful_df['sources_count'].mean()
+             avg_chunks = successful_df['chunks_retrieved'].mean()
+         else:
+             avg_confidence = avg_processing_time = avg_sources = avg_chunks = 0
+
+         # Generate summary text
+         summary = f"""
+ === Manufacturing RAG Agent - CSV Evaluation Summary ===
+ Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ 📊 Overall Results:
+ • Total Questions: {total_questions}
+ • Successful Answers: {successful_questions}
+ • Failed Answers: {failed_questions}
+ • Success Rate: {success_rate:.1f}%
+
+ 📈 Performance Metrics (Successful Questions):
+ • Average Confidence Score: {avg_confidence:.3f}
+ • Average Processing Time: {avg_processing_time:.2f}s
+ • Average Sources per Answer: {avg_sources:.1f}
+ • Average Chunks Retrieved: {avg_chunks:.1f}
+
+ 📋 Detailed Breakdown:
+ """
+
+         # Add confidence distribution
+         if len(successful_df) > 0:
+             confidence_ranges = [
+                 (0.9, 1.0, "Very High (0.9-1.0)"),
+                 (0.7, 0.9, "High (0.7-0.9)"),
+                 (0.5, 0.7, "Medium (0.5-0.7)"),
+                 (0.0, 0.5, "Low (0.0-0.5)")
+             ]
+
+             summary += "\n🎯 Confidence Score Distribution:\n"
+             for min_conf, max_conf, label in confidence_ranges:
+                 count = len(successful_df[
+                     (successful_df['confidence_score'] >= min_conf) &
+                     (successful_df['confidence_score'] < max_conf)
+                 ])
+                 percentage = (count / len(successful_df) * 100) if len(successful_df) > 0 else 0
+                 summary += f"• {label}: {count} questions ({percentage:.1f}%)\n"
+
+         # Add processing time distribution
+         if len(successful_df) > 0:
+             summary += "\n⏱️ Processing Time Distribution:\n"
+             time_ranges = [
+                 (0, 1, "Very Fast (0-1s)"),
+                 (1, 3, "Fast (1-3s)"),
+                 (3, 5, "Medium (3-5s)"),
+                 (5, float('inf'), "Slow (5s+)")
+             ]
+
+             for min_time, max_time, label in time_ranges:
+                 if max_time == float('inf'):
380
+ count = len(successful_df[successful_df['processing_time'] >= min_time])
381
+ else:
382
+ count = len(successful_df[
383
+ (successful_df['processing_time'] >= min_time) &
384
+ (successful_df['processing_time'] < max_time)
385
+ ])
386
+ percentage = (count / len(successful_df) * 100) if len(successful_df) > 0 else 0
387
+ summary += f"• {label}: {count} questions ({percentage:.1f}%)\n"
388
+
389
+ # Add error analysis
390
+ if failed_questions > 0:
391
+ summary += f"\n❌ Error Analysis:\n"
392
+ error_counts = results_df[results_df['success'] == False]['error_message'].value_counts()
393
+ for error, count in error_counts.head(5).items():
394
+ summary += f"• {error}: {count} occurrences\n"
395
+
396
+ # Add top performing questions
397
+ if len(successful_df) > 0:
398
+ summary += f"\n🏆 Top 5 Questions by Confidence:\n"
399
+ top_questions = successful_df.nlargest(5, 'confidence_score')
400
+ for idx, row in top_questions.iterrows():
401
+ question_preview = row['question'][:60] + "..." if len(row['question']) > 60 else row['question']
402
+ summary += f"• {question_preview} (Confidence: {row['confidence_score']:.3f})\n"
403
+
404
+ return summary
405
+
406
+
407
+ def create_sample_csv(output_path: str = "sample_questions.csv"):
408
+ """Create a sample CSV file with example questions."""
409
+ sample_questions = [
410
+ "What is the production yield mentioned in the documents?",
411
+ "What are the main quality control processes?",
412
+ "What is the average processing time for manufacturing?",
413
+ "What materials are used in the production process?",
414
+ "What are the safety requirements mentioned?",
415
+ "What is the capacity of the manufacturing line?",
416
+ "What quality metrics are tracked?",
417
+ "What is the maintenance schedule?",
418
+ "What are the operating temperatures?",
419
+ "What certifications are required?"
420
+ ]
421
+
422
+ df = pd.DataFrame({
423
+ 'id': range(1, len(sample_questions) + 1),
424
+ 'question': sample_questions,
425
+ 'category': ['production', 'quality', 'process', 'materials', 'safety',
426
+ 'capacity', 'metrics', 'maintenance', 'operations', 'compliance']
427
+ })
428
+
429
+ df.to_csv(output_path, index=False)
430
+ print(f"📝 Sample CSV created: {output_path}")
431
+ return output_path
432
+
433
+
434
+ def main():
435
+ """Main function for command-line usage."""
436
+ parser = argparse.ArgumentParser(description="Evaluate questions from CSV using Manufacturing RAG Agent")
437
+
438
+ parser.add_argument(
439
+ "input_csv",
440
+ nargs='?',
441
+ help="Path to input CSV file containing questions"
442
+ )
443
+
444
+ parser.add_argument(
445
+ "--create-sample",
446
+ action="store_true",
447
+ help="Create a sample CSV file with example questions"
448
+ )
449
+
450
+ parser.add_argument(
451
+ "--output-csv",
452
+ "-o",
453
+ help="Path to output CSV file for results (default: input_file_results.csv)"
454
+ )
455
+
456
+ parser.add_argument(
457
+ "--question-column",
458
+ "-q",
459
+ default="question",
460
+ help="Column name containing questions (default: 'question')"
461
+ )
462
+
463
+ parser.add_argument(
464
+ "--config",
465
+ "-c",
466
+ default="src/config.yaml",
467
+ help="Path to configuration file (default: src/config.yaml)"
468
+ )
469
+
470
+ parser.add_argument(
471
+ "--batch-size",
472
+ "-b",
473
+ type=int,
474
+ default=10,
475
+ help="Number of questions to process in each batch (default: 10)"
476
+ )
477
+
478
+ parser.add_argument(
479
+ "--delay",
480
+ "-d",
481
+ type=float,
482
+ default=1.0,
483
+ help="Delay between batches in seconds (default: 1.0)"
484
+ )
485
+
486
+ parser.add_argument(
487
+ "--no-summary",
488
+ action="store_true",
489
+ help="Skip generating summary file"
490
+ )
491
+
492
+ parser.add_argument(
493
+ "--verbose",
494
+ "-v",
495
+ action="store_true",
496
+ help="Enable verbose logging"
497
+ )
498
+
499
+ args = parser.parse_args()
500
+
501
+ # Handle create sample option
502
+ if args.create_sample:
503
+ sample_path = args.input_csv if args.input_csv else "sample_questions.csv"
504
+ create_sample_csv(sample_path)
505
+ print("\n🚀 To run evaluation:")
506
+ print(f"python {sys.argv[0]} {sample_path}")
507
+ return
508
+
509
+ # Validate input file
510
+ if not args.input_csv:
511
+ print("❌ Please provide an input CSV file or use --create-sample to create one")
512
+ parser.print_help()
513
+ sys.exit(1)
514
+
515
+ # Set logging level
516
+ if args.verbose:
517
+ logging.getLogger().setLevel(logging.DEBUG)
518
+
519
+ try:
520
+ # Validate input file
521
+ if not os.path.exists(args.input_csv):
522
+ print(f"❌ Input CSV file not found: {args.input_csv}")
523
+ sys.exit(1)
524
+
525
+ # Generate output path if not provided
526
+ if not args.output_csv:
527
+ input_path = Path(args.input_csv)
528
+ args.output_csv = str(input_path.parent / f"{input_path.stem}_results.csv")
529
+
530
+ print(f"🏭 Manufacturing RAG Agent - CSV Evaluation")
531
+ print(f"Input: {args.input_csv}")
532
+ print(f"Output: {args.output_csv}")
533
+ print(f"Question Column: {args.question_column}")
534
+ print(f"Config: {args.config}")
535
+ print("-" * 50)
536
+
537
+ # Initialize evaluator
538
+ print("🚀 Initializing RAG system...")
539
+ evaluator = CSVEvaluator(args.config)
540
+
541
+ if not evaluator.initialize_system():
542
+ print("❌ Failed to initialize RAG system")
543
+ sys.exit(1)
544
+
545
+ print("✅ RAG system initialized successfully")
546
+
547
+ # Load questions
548
+ print(f"📄 Loading questions from {args.input_csv}...")
549
+ questions_df = evaluator.load_questions_csv(args.input_csv, args.question_column)
550
+ print(f"✅ Loaded {len(questions_df)} questions")
551
+
552
+ # Evaluate questions
553
+ print("🔍 Starting evaluation...")
554
+ start_time = time.time()
555
+
556
+ results_df = evaluator.evaluate_questions(
557
+ questions_df,
558
+ question_column=args.question_column,
559
+ batch_size=args.batch_size,
560
+ delay_between_batches=args.delay
561
+ )
562
+
563
+ total_time = time.time() - start_time
564
+
565
+ # Save results
566
+ print(f"💾 Saving results to {args.output_csv}...")
567
+ summary_path = evaluator.save_results(
568
+ results_df,
569
+ args.output_csv,
570
+ include_summary=not args.no_summary
571
+ )
572
+
573
+ # Print final summary
574
+ successful = len(results_df[results_df['success'] == True])
575
+ success_rate = (successful / len(results_df) * 100) if len(results_df) > 0 else 0
576
+
577
+ print("\n" + "=" * 50)
578
+ print("🎉 Evaluation Complete!")
579
+ print(f"📊 Results: {successful}/{len(results_df)} questions answered successfully ({success_rate:.1f}%)")
580
+ print(f"⏱️ Total time: {total_time:.2f} seconds")
581
+ print(f"💾 Results saved to: {args.output_csv}")
582
+
583
+ if not args.no_summary:
584
+ print(f"📋 Summary saved to: {summary_path}")
585
+
586
+ print("\n🔍 Quick Preview of Results:")
587
+ if len(results_df) > 0:
588
+ preview_df = results_df[['question', 'answer', 'success', 'confidence_score']].head(3)
589
+ for idx, row in preview_df.iterrows():
590
+ status = "✅" if row['success'] else "❌"
591
+ conf = f"({row['confidence_score']:.2f})" if row['success'] else ""
592
+ question_preview = row['question'][:40] + "..." if len(row['question']) > 40 else row['question']
593
+ answer_preview = str(row['answer'])[:60] + "..." if len(str(row['answer'])) > 60 else str(row['answer'])
594
+ print(f"{status} Q: {question_preview}")
595
+ print(f" A: {answer_preview} {conf}")
596
+ print()
597
+
598
+ except KeyboardInterrupt:
599
+ print("\n🛑 Evaluation interrupted by user")
600
+ sys.exit(1)
601
+ except Exception as e:
602
+ logger.error(f"Evaluation failed: {e}")
603
+ print(f"❌ Evaluation failed: {e}")
604
+ sys.exit(1)
605
+
606
+
607
+ if __name__ == "__main__":
608
+ main()
data/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/.gitkeep ADDED
@@ -0,0 +1,5 @@
+# This file ensures the data directory is tracked by git
+# The data directory will contain:
+# - documents/ - uploaded files
+# - metadata.db - document metadata database
+# - vector_store/ - vector database files (if using local storage)
data/documents/.gitkeep ADDED
@@ -0,0 +1 @@
+# Directory for storing uploaded documents
data/metadata.db ADDED
Binary file (36.9 kB). View file
 
demo.py ADDED
@@ -0,0 +1,198 @@
+import sys
+import subprocess
+import os
+from pathlib import Path
+
+def print_banner():
+    """Print welcome banner."""
+    print("=" * 70)
+    print("🇰🇷 Korean Q&A Evaluation System - Demo Launcher")
+    print("=" * 70)
+    print()
+
+def print_options():
+    """Print available demo options."""
+    print("📱 Available Demo Interfaces:")
+    print()
+    print("1. 🎯 Gradio Demo (Recommended)")
+    print("   - Interactive web interface")
+    print("   - Real-time evaluation")
+    print("   - Shareable public links")
+    print("   - Best for: Quick testing and sharing")
+    print()
+    print("2. 📊 Streamlit Dashboard")
+    print("   - Professional analytics interface")
+    print("   - Advanced visualizations")
+    print("   - Detailed metrics analysis")
+    print("   - Best for: In-depth analysis")
+    print()
+    print("3. 🖥️ Command Line Interface")
+    print("   - Traditional CLI evaluation")
+    print("   - Batch processing")
+    print("   - Automated workflows")
+    print("   - Best for: Production use")
+    print()
+    print("4. 🧪 Test Setup")
+    print("   - Verify system configuration")
+    print("   - Check dependencies")
+    print("   - Validate API keys")
+    print()
+    print("0. ❌ Exit")
+    print()
+
+def launch_gradio():
+    """Launch Gradio demo."""
+    print("🚀 Launching Gradio Demo...")
+    print("📱 Will be available at: http://localhost:7860")
+    print("🌐 Public link will be generated for sharing")
+    print()
+    try:
+        subprocess.run([sys.executable, "gradio_demo.py"], check=True)
+    except KeyboardInterrupt:
+        print("\n👋 Gradio demo stopped")
+    except Exception as e:
+        print(f"❌ Error launching Gradio: {e}")
+
+def launch_streamlit():
+    """Launch Streamlit demo."""
+    print("🚀 Launching Streamlit Dashboard...")
+    print("📱 Will be available at: http://localhost:8501")
+    print()
+    try:
+        subprocess.run([
+            sys.executable, "-m", "streamlit", "run", "streamlit_demo.py",
+            "--server.port", "8501",
+            "--server.address", "0.0.0.0",
+            "--browser.gatherUsageStats", "false"
+        ], check=True)
+    except KeyboardInterrupt:
+        print("\n👋 Streamlit demo stopped")
+    except Exception as e:
+        print(f"❌ Error launching Streamlit: {e}")
+
+def launch_cli():
+    """Launch CLI interface."""
+    print("🖥️ Command Line Interface Options:")
+    print()
+    print("1. Quick evaluation with default settings")
+    print("2. Custom evaluation with parameters")
+    print("3. Back to main menu")
+    print()
+
+    choice = input("Select option (1-3): ").strip()
+
+    if choice == "1":
+        print("🚀 Running quick evaluation...")
+        try:
+            subprocess.run([sys.executable, "run_evaluation.py"], check=True)
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    elif choice == "2":
+        print("📝 Custom Evaluation Parameters:")
+        dataset = input("Dataset path (default: assets/bench_korean.csv): ").strip()
+        if not dataset:
+            dataset = "assets/bench_korean.csv"
+
+        threshold = input("Threshold (default: 0.8): ").strip()
+        if not threshold:
+            threshold = "0.8"
+
+        log_level = input("Log level (DEBUG/INFO/WARNING/ERROR, default: INFO): ").strip()
+        if not log_level:
+            log_level = "INFO"
+
+        verbose = input("Verbose mode? (y/n, default: y): ").strip().lower()
+        verbose_flag = "--verbose" if verbose != "n" else ""
+
+        cmd = [
+            sys.executable, "main.py",
+            "--dataset", dataset,
+            "--threshold", threshold,
+            "--log-level", log_level
+        ]
+        if verbose_flag:
+            cmd.append(verbose_flag)
+
+        print(f"🚀 Running: {' '.join(cmd)}")
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    elif choice == "3":
+        return
+    else:
+        print("❌ Invalid option")
+
+def test_setup():
+    """Run setup test."""
+    print("🧪 Testing system setup...")
+    try:
+        subprocess.run([sys.executable, "test_setup.py"], check=True)
+    except Exception as e:
+        print(f"❌ Setup test failed: {e}")
+
+def check_dependencies():
+    """Check if required packages are installed."""
+    required_packages = [
+        "gradio", "streamlit", "plotly", "deepeval",
+        "pandas"
+    ]
+
+    missing_packages = []
+
+    for package in required_packages:
+        try:
+            __import__(package)
+        except ImportError:
+            missing_packages.append(package)
+
+    if missing_packages:
+        print("⚠️ Missing required packages:")
+        for package in missing_packages:
+            print(f"   - {package}")
+        print()
+        print("📦 Install missing packages with:")
+        print("   pip install -r requirements.txt")
+        print()
+        return False
+
+    return True
+
+def main():
+    """Main demo launcher."""
+    print_banner()
+
+    # Check dependencies
+    if not check_dependencies():
+        sys.exit(1)
+
+    while True:
+        print_options()
+        choice = input("Select an option (0-4): ").strip()
+
+        if choice == "1":
+            launch_gradio()
+        elif choice == "2":
+            launch_streamlit()
+        elif choice == "3":
+            launch_cli()
+        elif choice == "4":
+            test_setup()
+        elif choice == "0":
+            print("👋 Goodbye!")
+            break
+        else:
+            print("❌ Invalid option. Please try again.")
+
+        print("\n" + "=" * 50 + "\n")
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n👋 Demo launcher stopped")
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        sys.exit(1)
direct_rag_loader.py ADDED
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+"""
+Direct Document Loading Script for RAG Pipeline
+This script loads documents directly from a data directory into the RAG system
+and provides an interactive question-answering interface.
+"""
+
+import os
+import sys
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+import time
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Add src to path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+try:
+    from src.config import Config
+    from src.ingestion_pipeline import DocumentIngestionPipeline, IngestionResult
+    from src.rag_engine import RAGEngine, RAGResponse
+    from src.metadata_manager import MetadataManager
+    from src.vector_store import QdrantVectorStore, QdrantClient
+    from src.embedding_system import EmbeddingSystem, RerankResult
+    from logger.custom_logger import CustomLoggerTracker
+    from src.document_processor import ProcessingStatus, DocumentProcessorFactory, DocumentType
+    from src.pdf_processor import PDFProcessor
+    from src.excel_processor import ExcelProcessor
+    from src.image_processor import ImageProcessor
+
+    # Initialize logger
+    custom_log = CustomLoggerTracker()
+    logger = custom_log.get_logger("direct_rag_loader")
+
+except ImportError as e:
+    print(f"Failed to import RAG components: {e}")
+    print("Please ensure all src/ modules are available and properly structured.")
+    sys.exit(1)
+
+
+class DirectRAGLoader:
+    """
+    Direct document loader for RAG system.
+    Loads documents from a specified directory and enables question answering.
+    """
+
+    def __init__(self, data_directory: str = "data", config_path: str = "src/config.yaml"):
+        """
+        Initialize the RAG loader.
+
+        Args:
+            data_directory: Directory containing documents to load
+            config_path: Path to configuration file
+        """
+        self.data_directory = Path(data_directory)
+        self.config_path = config_path
+
+        # RAG components
+        self.config = None
+        self.ingestion_pipeline = None
+        self.rag_engine = None
+        self.metadata_manager = None
+
+        # Document tracking
+        self.loaded_documents = []
+        self.processing_results = []
+
+        logger.info(f"DirectRAGLoader initialized for directory: {self.data_directory}")
+
+    def initialize_system(self) -> bool:
+        """
+        Initialize the RAG system components.
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            logger.info("Initializing RAG system...")
+
+            # Check if config file exists
+            if not Path(self.config_path).exists():
+                logger.error(f"Configuration file not found: {self.config_path}")
+                return False
+
+            # Load configuration
+            self.config = Config(self.config_path)
+            logger.info("Configuration loaded successfully")
+
+            # Initialize components with config
+            config_dict = {
+                'siliconflow_api_key': self.config.siliconflow_api_key,
+                'groq_api_key': self.config.groq_api_key,
+                'qdrant_url': self.config.qdrant_url,
+                'qdrant_api_key': self.config.qdrant_api_key,
+                **self.config.rag_config,
+                **self.config.document_processing_config,
+                **self.config.storage_config
+            }
+
+            # Initialize core components
+            self.ingestion_pipeline = DocumentIngestionPipeline(config_dict)
+            self.rag_engine = RAGEngine(config_dict)
+            self.metadata_manager = MetadataManager(config_dict)
+            # Register document processors
+            DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
+            DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)
+            DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
+
+            logger.info("RAG system initialized successfully")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to initialize RAG system: {e}")
+            return False
+
+    def discover_documents(self) -> List[Path]:
+        """Find all supported documents in the data directory."""
+        if not self.data_directory.exists():
+            logger.error(f"Data directory does not exist: {self.data_directory}")
+            return []
+
+        # Supported file extensions
+        supported_extensions = ['.pdf', '.xlsx', '.xls', '.xlsm', '.png', '.jpg', '.jpeg', '.csv', '.txt']
+
+        documents = []
+        for ext in supported_extensions:
+            documents.extend(self.data_directory.glob(f"*{ext}"))
+            documents.extend(self.data_directory.glob(f"**/*{ext}"))  # Recursive search
+
+        # Remove duplicates and sort
+        documents = sorted(set(documents))
+
+        logger.info(f"Found {len(documents)} documents in {self.data_directory}")
+        for doc in documents:
+            logger.info(f"  - {doc.name} ({doc.suffix})")
+
+        return documents
+
+    def load_documents(self, document_paths: Optional[List[Path]] = None) -> bool:
+        """
+        Load documents into the RAG system.
+
+        Args:
+            document_paths: Optional list of specific documents to load.
+                            If None, loads all discovered documents.
+
+        Returns:
+            True if at least one document was loaded successfully
+        """
+        if not self.ingestion_pipeline:
+            logger.error("RAG system not initialized. Call initialize_system() first.")
+            return False
+
+        # Discover documents if not provided
+        if document_paths is None:
+            document_paths = self.discover_documents()
+
+        if not document_paths:
+            logger.warning("No documents found to load")
+            return False
+
+        logger.info(f"Starting batch ingestion of {len(document_paths)} documents...")
+
+        # Convert Path objects to strings
+        file_paths = [str(path) for path in document_paths]
+
+        # Process documents in batch
+        start_time = time.time()
+        batch_stats = self.ingestion_pipeline.ingest_batch(file_paths, max_workers=2)
+
+        # Store results
+        self.processing_results = batch_stats
+
+        # Log results
+        logger.info("=" * 60)
+        logger.info("BATCH PROCESSING RESULTS")
+        logger.info("=" * 60)
+        logger.info(f"Total documents: {batch_stats.total_documents}")
+        logger.info(f"Successful: {batch_stats.successful_documents}")
+        logger.info(f"Failed: {batch_stats.failed_documents}")
+        logger.info(f"Total chunks created: {batch_stats.total_chunks}")
+        logger.info(f"Processing time: {batch_stats.total_processing_time:.2f}s")
+        logger.info(f"Average time per document: {batch_stats.average_processing_time:.2f}s")
+
+        if batch_stats.documents_by_type:
+            logger.info("Documents by type:")
+            for doc_type, count in batch_stats.documents_by_type.items():
+                logger.info(f"  {doc_type}: {count}")
+
+        if batch_stats.errors:
+            logger.warning("Errors encountered:")
+            for error in batch_stats.errors:
+                logger.warning(f"  - {error}")
+
+        logger.info("=" * 60)
+
+        return batch_stats.successful_documents > 0
+
+    def ask_question(self, question: str, max_results: int = 5,
+                     show_citations: bool = True) -> Optional[RAGResponse]:
+        """
+        Ask a question to the RAG system.
+
+        Args:
+            question: Question to ask
+            max_results: Maximum number of context chunks to use
+            show_citations: Whether to display citations
+
+        Returns:
+            RAGResponse object or None if failed
+        """
+        if not self.rag_engine:
+            logger.error("RAG system not initialized. Call initialize_system() first.")
+            return None
+
+        try:
+            logger.info(f"Processing question: {question}")
+
+            # Temporarily adjust RAG engine parameters
+            original_top_k = self.rag_engine.final_top_k
+            self.rag_engine.final_top_k = max_results
+
+            # Get response
+            response = self.rag_engine.answer_question(question)
+
+            # Restore original parameter
+            self.rag_engine.final_top_k = original_top_k
+
+            # Display response
+            self._display_response(response, show_citations)
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Failed to process question: {e}")
+            return None
+
+    def _display_response(self, response: RAGResponse, show_citations: bool = True):
+        """Display RAG response in a formatted way."""
+        print("\n" + "="*60)
+        print("🤖 RAG SYSTEM RESPONSE")
+        print("="*60)
+
+        if not response.success:
+            print(f"❌ Error: {response.error_message}")
+            return
+
+        # Main answer
+        print("📝 Answer:")
+        print(f"{response.answer}")
+        print()
+
+        # Metrics
+        print("📊 Metrics:")
+        print(f"  • Confidence Score: {response.confidence_score:.3f}")
+        print(f"  • Processing Time: {response.processing_time:.3f}s")
+        print(f"  • Sources Used: {len(response.citations)}")
+        print(f"  • Chunks Retrieved: {response.total_chunks_retrieved}")
+        print(f"  • Model Used: {response.model_used}")
+        print()
+
+        # Performance breakdown
+        print("⚡ Performance Breakdown:")
+        print(f"  • Retrieval: {response.retrieval_time:.3f}s")
+        print(f"  • Reranking: {response.rerank_time:.3f}s")
+        print(f"  • Generation: {response.generation_time:.3f}s")
+        print()
+
+        # Citations
+        if show_citations and response.citations:
+            print("📚 Sources & Citations:")
+            for i, citation in enumerate(response.citations, 1):
+                print(f"  [{i}] {citation.source_file}")
+
+                # Location details
+                location_parts = []
+                if citation.page_number:
+                    location_parts.append(f"Page {citation.page_number}")
+                if citation.worksheet_name:
+                    location_parts.append(f"Sheet: {citation.worksheet_name}")
+                if citation.cell_range:
+                    location_parts.append(f"Range: {citation.cell_range}")
+                if citation.section_title:
+                    location_parts.append(f"Section: {citation.section_title}")
+
+                if location_parts:
+                    print(f"      📍 {' | '.join(location_parts)}")
+
+                print(f"      📈 Confidence: {citation.confidence:.3f}")
+                print(f"      📄 Snippet: {citation.text_snippet[:100]}...")
+                print()
+
+        print("="*60)
+
+    def interactive_qa_session(self):
+        """Start an interactive question-answering session."""
+        print("\n" + "="*60)
+        print("🤖 INTERACTIVE Q&A SESSION")
+        print("="*60)
+        print("Enter your questions below. Type 'quit', 'exit', or 'q' to stop.")
+        print("Type 'status' to see system status.")
+        print("Type 'docs' to see loaded documents.")
+        print("="*60)
+
+        while True:
+            try:
+                # Get user input
+                question = input("\n❓ Your question: ").strip()
+
+                if not question:
+                    continue
+
+                # Check for special commands
+                if question.lower() in ['quit', 'exit', 'q']:
+                    print("👋 Goodbye!")
+                    break
+                elif question.lower() == 'status':
+                    self._show_system_status()
+                    continue
+                elif question.lower() == 'docs':
+                    self._show_loaded_documents()
+                    continue
+
+                # Process question
+                print("🔍 Processing your question...")
+                response = self.ask_question(question, max_results=5, show_citations=True)
+
+                if not response:
+                    print("❌ Failed to get response. Please try again.")
+
+            except KeyboardInterrupt:
+                print("\n\n👋 Session interrupted. Goodbye!")
+                break
+            except Exception as e:
+                print(f"❌ Error: {e}")
+                continue
+
+    def _show_system_status(self):
+        """Display system status information."""
+        print("\n" + "="*50)
+        print("⚙️ SYSTEM STATUS")
+        print("="*50)
+
+        try:
+            # RAG engine health check
+            if self.rag_engine:
+                health = self.rag_engine.health_check()
+                for component, status in health.items():
+                    status_icon = "✅" if status else "❌"
+                    print(f"  {component.replace('_', ' ').title()}: {status_icon}")
+
+            # Document statistics
+            if self.metadata_manager:
+                stats = self.metadata_manager.get_statistics()
+                print("\n📊 Document Statistics:")
+                print(f"  Total Documents: {stats.get('total_documents', 0)}")
+                print(f"  Total Chunks: {stats.get('total_chunks', 0)}")
+                print(f"  Total File Size: {self._format_file_size(stats.get('total_file_size', 0))}")
+
+                # Documents by status
+                status_counts = stats.get('documents_by_status', {})
+                if status_counts:
+                    print("  By Status:")
+                    for status, count in status_counts.items():
+                        print(f"    {status}: {count}")
+
+        except Exception as e:
+            print(f"❌ Error getting system status: {e}")
+
+        print("="*50)
+
+    def _show_loaded_documents(self):
+        """Display loaded documents information."""
+        print("\n" + "="*50)
+        print("📚 LOADED DOCUMENTS")
+        print("="*50)
+
+        try:
+            if self.metadata_manager:
+                documents = self.metadata_manager.list_documents(limit=50)
+
+                if not documents:
+                    print("No documents loaded yet.")
+                    return
+
+                for doc in documents:
+                    status_icon = "✅" if doc.processing_status == ProcessingStatus.COMPLETED else "❌"
+                    print(f"  {status_icon} {doc.filename}")
+                    print(f"      Type: {doc.file_type.upper()}")
+                    print(f"      Chunks: {doc.total_chunks}")
+                    print(f"      Size: {self._format_file_size(doc.file_size)}")
+                    print(f"      Status: {doc.processing_status.value}")
+                    if doc.error_message:
+                        print(f"      Error: {doc.error_message}")
+                    print()
+
+        except Exception as e:
+            print(f"❌ Error getting document list: {e}")
+
+        print("="*50)
+
+    def _format_file_size(self, size_bytes: int) -> str:
+        """Format file size in human readable format."""
+        if size_bytes == 0:
+            return "0B"
+
+        size_names = ["B", "KB", "MB", "GB", "TB"]
+        i = 0
+        while size_bytes >= 1024 and i < len(size_names) - 1:
+            size_bytes /= 1024.0
+            i += 1
+
+        return f"{size_bytes:.1f}{size_names[i]}"
+
+
+def main():
+    """Main function to run the direct RAG loader."""
+    print("🏭 Manufacturing RAG Agent - Direct Document Loader")
+    print("="*60)
+
+    # Configuration
+    data_directory = "data/documents/"  # Change this to your documents directory
+    config_path = "src/config.yaml"  # Change this to your config file path
+
+    # Initialize loader
+    loader = DirectRAGLoader(data_directory=data_directory, config_path=config_path)
+
+    try:
+        # Step 1: Initialize system
+        print("🔧 Initializing RAG system...")
+        if not loader.initialize_system():
+            print("❌ Failed to initialize RAG system. Please check your configuration and API keys.")
+            return
+
+        print("✅ RAG system initialized successfully!")
+
+        # Step 2: Load documents
+        print("📚 Loading documents...")
+        if not loader.load_documents():
+            print("❌ Failed to load documents. Please check your data directory and file formats.")
+            return
+
+        print("✅ Documents loaded successfully!")
+
+        # Step 3: Start interactive session
+        loader.interactive_qa_session()
+
+    except KeyboardInterrupt:
+        print("\n👋 Application interrupted. Goodbye!")
+
+    except Exception as e:
+        logger.error(f"Application error: {e}")
+        print(f"❌ Application error: {e}")
+
+
+if __name__ == "__main__":
+    main()
docker-compose.yml ADDED
@@ -0,0 +1,15 @@
+version: '3.8'
+services:
+  qdrant:
+    image: qdrant/qdrant
+    ports:
+      - "6333:6333"
+      - "6334:6334"
+    volumes:
+      - qdrant_data:/qdrant/storage
+    environment:
+      - QDRANT__SERVICE__HTTP_PORT=6333
+      - QDRANT__SERVICE__GRPC_PORT=6334
+
+volumes:
+  qdrant_data:
embedding_cache/17/17af3460f99996bb3e89f78743fbaa60dba90b6d8cd6da492ced29aa7c4f8bc9.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49940866843a9558d2a368c605f7b74e9e171882f5b3b2e1d43f07022fcb7f1e
+size 110787
embedding_cache/cache_metadata.db ADDED
Binary file (20.5 kB). View file
 
fixed_gradio_demo.py ADDED
@@ -0,0 +1,457 @@
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from pathlib import Path
+import tempfile
+import time
+import logging
+import os
+import sys
+import shutil
+from typing import Dict, Any, Tuple, List
+from datetime import datetime
+from dotenv import load_dotenv
+
+load_dotenv()
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+try:
+    from src.config import Config
+    from src.ingestion_pipeline import DocumentIngestionPipeline
+    from src.rag_engine import RAGEngine
+    from src.metadata_manager import MetadataManager
+    from src.document_processor import ProcessingStatus, DocumentProcessorFactory, DocumentType
+    from src.pdf_processor import PDFProcessor
+    from src.excel_processor import ExcelProcessor
+    from src.image_processor import ImageProcessor
+
+except ImportError as e:
+    logger.error(f"Failed to import RAG components: {e}")
+    print(f"❌ Import Error: {e}")
+    print("Please ensure all src/ modules are properly structured")
+    sys.exit(1)
+
+class RAGGradioDemo:
+    """Fixed Gradio demo for Manufacturing RAG Agent with proper file handling."""
+
+    def __init__(self):
+        self.system_initialized = False
+        self.rag_engine = None
+        self.ingestion_pipeline = None
+        self.metadata_manager = None
+        self.chat_history = []
+
+    def initialize_system(self):
+        """Initialize the RAG system."""
+        try:
+            config_paths = [
+                "src/config.yaml",
+                "config.yaml",
+                os.path.join(os.path.dirname(__file__), "config.yaml"),
+                os.path.join(os.path.dirname(os.path.dirname(__file__)), "src", "config.yaml")
+            ]
+
+            config_path = None
+            for path in config_paths:
+                if os.path.exists(path):
+                    config_path = path
+                    break
+
+            if not config_path:
+                return "❌ Configuration file not found. Please ensure src/config.yaml exists."
+
+            logger.info(f"Using config file: {config_path}")
+
+            # Load configuration
+            config = Config(config_path)
+
+            # Validate API keys
+            if not config.groq_api_key:
+                return "❌ Missing GROQ_API_KEY in environment variables"
+            if not config.siliconflow_api_key:
+                return "❌ Missing SILICONFLOW_API_KEY in environment variables"
+            if not config.qdrant_url:
+                return "❌ Missing QDRANT_URL in environment variables"
+
+            # Create configuration dictionary
+            rag_config = config.rag_config
+            config_dict = {
+                'siliconflow_api_key': config.siliconflow_api_key,
+                'groq_api_key': config.groq_api_key,
+                'qdrant_url': config.qdrant_url,
+                'qdrant_api_key': config.qdrant_api_key,
+                'qdrant_collection': 'manufacturing_docs',
+                'embedding_model': rag_config.get('embedding_model', 'Qwen/Qwen3-Embedding-8B'),
+                'reranker_model': rag_config.get('reranker_model', 'Qwen/Qwen3-Reranker-8B'),
+                'llm_model': rag_config.get('llm_model', 'openai/gpt-oss-120b'),
+                'vector_size': 1024,  # Must match the dimensions produced by the embedding model
+                'max_context_chunks': rag_config.get('max_context_chunks', 5),
+                'similarity_threshold': rag_config.get('similarity_threshold', 0.7),
+                'chunk_size': rag_config.get('chunk_size', 512),
+                'chunk_overlap': rag_config.get('chunk_overlap', 50),
+                'metadata_db_path': './data/metadata.db',
+                'max_retries': 3,
+                'rerank_top_k': 20,
+                'final_top_k': 5
+            }
+
+            # Register processors
+            DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
+            DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)
+            DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
+
+            # Initialize components
+            self.metadata_manager = MetadataManager(config_dict)
+            self.ingestion_pipeline = DocumentIngestionPipeline(config_dict)
+            self.rag_engine = RAGEngine(config_dict)
+
+            self.system_initialized = True
+            return "✅ System initialized successfully!"
+
+        except Exception as e:
+            logger.error(f"Initialization failed: {e}")
+            return f"❌ Initialization failed: {str(e)}"
+
+    def process_files(self, files):
+        """Process uploaded files through the ingestion pipeline."""
+        if not self.system_initialized:
+            return "❌ System not initialized", pd.DataFrame()
+
+        if not files:
+            return "No files uploaded", pd.DataFrame()
+
+        results = []
+
+        for i, file_obj in enumerate(files):
+            # Initialize before the try block so the finally clause and the
+            # error handler can always reference these names safely
+            file_path = None
+            temp_path = None
+            filename = f'File {i+1}'
+            try:
+                logger.info(f"Processing file {i+1}/{len(files)}: {file_obj}")
+
+                # Handle different types of file objects from Gradio
+                # Check if file_obj is a path string
+                if isinstance(file_obj, str):
+                    file_path = file_obj
+                    filename = os.path.basename(file_path)
+                # Check if it's a file-like object with a name
+                elif hasattr(file_obj, 'name'):
+                    file_path = file_obj.name
+                    filename = os.path.basename(file_path)
+                # Check if it's a tuple/list (Gradio sometimes returns tuples)
+                elif isinstance(file_obj, (tuple, list)) and len(file_obj) > 0:
+                    file_path = file_obj[0] if isinstance(file_obj[0], str) else file_obj[0].name
+                    filename = os.path.basename(file_path)
+                else:
+                    logger.error(f"Unknown file object type: {type(file_obj)}")
+                    results.append({
+                        'Filename': f'Unknown file {i+1}',
+                        'Status': '❌ Failed',
+                        'Chunks': 0,
+                        'Time': '0.00s',
+                        'Error': 'Unknown file object type'
+                    })
+                    continue
+
+                if not file_path or not os.path.exists(file_path):
+                    logger.error(f"File path does not exist: {file_path}")
+                    results.append({
+                        'Filename': filename,
+                        'Status': '❌ Failed',
+                        'Chunks': 0,
+                        'Time': '0.00s',
+                        'Error': 'File path not found'
+                    })
+                    continue
+
+                logger.info(f"Processing file: {filename} from path: {file_path}")
+
+                # Create a temporary copy (to avoid issues with Gradio's temp files)
+                suffix = Path(filename).suffix
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                    shutil.copy2(file_path, tmp.name)
+                    temp_path = tmp.name
+
+                # Process the document
+                start_time = time.time()
+                result = self.ingestion_pipeline.ingest_document(temp_path)
+                processing_time = time.time() - start_time
+
+                results.append({
+                    'Filename': filename,
+                    'Status': '✅ Success' if result.success else '❌ Failed',
+                    'Chunks': result.chunks_indexed if result.success else 0,
+                    'Time': f"{processing_time:.2f}s",
+                    'Error': result.error_message if not result.success else 'None'
+                })
+
+                logger.info(f"{'Success' if result.success else 'Failed'}: {filename}")
+
+            except Exception as e:
+                logger.error(f"Error processing file {i+1}: {e}")
+                results.append({
+                    'Filename': filename,
+                    'Status': '❌ Failed',
+                    'Chunks': 0,
+                    'Time': '0.00s',
+                    'Error': str(e)
+                })
+
+            finally:
+                # Clean up temp file
+                if temp_path and os.path.exists(temp_path):
+                    try:
+                        os.unlink(temp_path)
+                    except Exception as e:
+                        logger.warning(f"Failed to clean temp file: {e}")
+
+        # Create summary
+        successful = sum(1 for r in results if 'Success' in r['Status'])
+        total_chunks = sum(r['Chunks'] for r in results if isinstance(r['Chunks'], int))
+
+        status = f"✅ Processed {successful}/{len(results)} files successfully. Total chunks: {total_chunks}"
+
+        return status, pd.DataFrame(results)
+
+    def ask_question(self, question, max_results=5, threshold=0.7):
+        """Ask a question to the RAG system."""
+        if not self.system_initialized:
+            return "❌ System not initialized", "", pd.DataFrame()
+
+        if not question.strip():
+            return "Please enter a question", "", pd.DataFrame()
+
+        try:
+            # Check for documents
+            docs = self.metadata_manager.list_documents(status=ProcessingStatus.COMPLETED, limit=1)
+            if not docs:
+                return "⚠️ No processed documents available. Please upload documents first.", "", pd.DataFrame()
+
+            # Update RAG settings temporarily
+            original_final_top_k = self.rag_engine.final_top_k
+            original_threshold = self.rag_engine.similarity_threshold
+
+            self.rag_engine.final_top_k = max_results
+            self.rag_engine.similarity_threshold = threshold
+
+            # Get answer
+            logger.info(f"Processing question: {question[:50]}...")
+            response = self.rag_engine.answer_question(question)
+
+            # Restore settings
+            self.rag_engine.final_top_k = original_final_top_k
+            self.rag_engine.similarity_threshold = original_threshold
+
+            if not response.success:
+                return f"❌ {response.error_message}", "", pd.DataFrame()
+
+            # Format citations
+            citations = "## 📚 Sources & Citations\n\n"
+            for i, citation in enumerate(response.citations):
+                citations += f"**{i+1}.** {citation.source_file}\n"
+                if citation.page_number:
+                    citations += f"📄 Page {citation.page_number}\n"
+                if citation.worksheet_name:
+                    citations += f"📊 Sheet: {citation.worksheet_name}\n"
+                citations += f"*Excerpt:* \"{citation.text_snippet[:100]}...\"\n\n"
+
+            # Performance metrics
+            metrics = pd.DataFrame({
+                'Metric': ['Confidence Score', 'Processing Time (s)', 'Sources Used', 'Chunks Retrieved'],
+                'Value': [
+                    f"{response.confidence_score:.3f}",
+                    f"{response.processing_time:.2f}",
+                    len(response.citations),
+                    response.total_chunks_retrieved
+                ]
+            })
+
+            return response.answer, citations, metrics
+
+        except Exception as e:
+            logger.error(f"Question processing failed: {e}")
+            return f"❌ Error: {str(e)}", "", pd.DataFrame()
+
+    def get_document_library(self):
+        """Get list of processed documents."""
+        if not self.system_initialized:
+            return pd.DataFrame({'Message': ['System not initialized']})
+
+        try:
+            documents = self.metadata_manager.list_documents(limit=50)
+            if not documents:
+                return pd.DataFrame({'Message': ['No documents processed yet']})
+
+            doc_data = []
+            for doc in documents:
+                doc_data.append({
+                    'Filename': doc.filename,
+                    'Type': doc.file_type.upper(),
+                    'Status': doc.processing_status.value.title(),
+                    'Chunks': doc.total_chunks,
+                    'Size': self._format_size(doc.file_size),
+                    'Uploaded': doc.upload_timestamp.strftime('%Y-%m-%d %H:%M')
+                })
+
+            return pd.DataFrame(doc_data)
+
+        except Exception as e:
+            logger.error(f"Failed to get document library: {e}")
+            return pd.DataFrame({'Error': [str(e)]})
+
+    def _format_size(self, size_bytes):
+        """Format file size."""
+        if size_bytes == 0:
+            return "0B"
+
+        size_names = ["B", "KB", "MB", "GB"]
+        i = 0
+        while size_bytes >= 1024 and i < len(size_names) - 1:
+            size_bytes /= 1024.0
+            i += 1
+
+        return f"{size_bytes:.1f}{size_names[i]}"
+
+def create_interface():
+    """Create the Gradio interface."""
+    demo = RAGGradioDemo()
+
+    with gr.Blocks(title="Manufacturing RAG Agent", theme=gr.themes.Soft()) as app:
+        gr.Markdown("""
+        # 🏭 Manufacturing RAG Agent
+        *Upload documents and ask questions about manufacturing data*
+
+        **Supports:** PDF, Excel (.xlsx, .xls), Images (.png, .jpg, .jpeg)
+        """)
+
+        # System initialization
+        with gr.Row():
+            init_btn = gr.Button("🚀 Initialize System", variant="primary")
+            status_display = gr.Textbox("System not initialized", label="System Status", interactive=False)
+
+        with gr.Tabs():
+            # Document Upload Tab
+            with gr.TabItem("📄 Document Upload"):
+                gr.Markdown("### Upload and Process Documents")
+
+                with gr.Column():
+                    file_input = gr.File(
+                        file_count="multiple",
+                        file_types=[".pdf", ".xlsx", ".xls", ".xlsm", ".png", ".jpg", ".jpeg"],
+                        label="Upload Documents"
+                    )
+                    upload_btn = gr.Button("🔄 Process Documents", variant="primary")
+
+                    upload_status = gr.Textbox(
+                        label="Processing Status",
+                        interactive=False,
+                        lines=2
+                    )
+
+                    upload_results = gr.Dataframe(
+                        label="Processing Results",
+                        interactive=False
+                    )
+
+                    gr.Markdown("### 📚 Document Library")
+                    refresh_btn = gr.Button("🔄 Refresh Library")
+                    doc_library = gr.Dataframe(
+                        label="Processed Documents",
+                        interactive=False
+                    )
+
+            # Question Answering Tab
+            with gr.TabItem("❓ Ask Questions"):
+                gr.Markdown("### Ask Questions About Your Documents")
+
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        question_input = gr.Textbox(
+                            label="Your Question",
+                            placeholder="e.g., What is the production yield mentioned in the documents?",
+                            lines=3
+                        )
+                        ask_btn = gr.Button("🔍 Ask Question", variant="primary")
+
+                    with gr.Column(scale=1):
+                        gr.Markdown("#### Settings")
+                        max_results = gr.Slider(
+                            minimum=1, maximum=10, value=5, step=1,
+                            label="Max Context Chunks"
+                        )
+                        similarity_threshold = gr.Slider(
+                            minimum=0.0, maximum=1.0, value=0.7, step=0.05,
+                            label="Similarity Threshold"
+                        )
+
+                # Answer display
+                answer_output = gr.Markdown(label="Answer")
+                citations_output = gr.Markdown(label="Citations")
+                performance_metrics = gr.Dataframe(
+                    label="Performance Metrics",
+                    interactive=False
+                )
+
+        # Event handlers
+        init_btn.click(
+            demo.initialize_system,
+            outputs=[status_display]
+        )
+
+        upload_btn.click(
+            demo.process_files,
+            inputs=[file_input],
+            outputs=[upload_status, upload_results]
+        )
+
+        ask_btn.click(
+            demo.ask_question,
+            inputs=[question_input, max_results, similarity_threshold],
+            outputs=[answer_output, citations_output, performance_metrics]
+        )
+
+        refresh_btn.click(
+            demo.get_document_library,
+            outputs=[doc_library]
+        )
+
+        # Auto-refresh library after upload
+        upload_btn.click(
+            demo.get_document_library,
+            outputs=[doc_library]
+        )
+
+    return app
+
+
+def main():
+    """Launch the application."""
+    try:
+        # Create necessary directories
+        os.makedirs("data", exist_ok=True)
+        os.makedirs("logs", exist_ok=True)
+
+        # Create interface
+        app = create_interface()
+
+        # Launch
+        print("🏭 Launching Manufacturing RAG Agent...")
+        print("📱 Interface will be available at: http://localhost:7860")
+        print("🛑 Press Ctrl+C to stop")
+
+        app.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            debug=True,
+            show_error=True
+        )
+
+    except Exception as e:
+        print(f"❌ Failed to launch: {e}")
+
+if __name__ == "__main__":
+    main()
launch_rag_demo.py ADDED
@@ -0,0 +1,37 @@
+import subprocess
+import sys
+from pathlib import Path
+
+def main():
+    """Launch the Streamlit demo application."""
+    # Get the directory containing this script
+    script_dir = Path(__file__).parent
+    demo_path = script_dir / "src" / "rag_demo.py"
+
+    if not demo_path.exists():
+        print(f"Error: Demo file not found at {demo_path}")
+        sys.exit(1)
+
+    # Launch Streamlit
+    try:
+        print("🏭 Launching Manufacturing RAG Agent Demo...")
+        print(f"📁 Demo path: {demo_path}")
+        print("🌐 The demo will open in your default web browser")
+        print("🛑 Press Ctrl+C to stop the demo")
+        print("-" * 50)
+
+        subprocess.run([
+            sys.executable, "-m", "streamlit", "run", str(demo_path),
+            "--server.port", "8501",
+            "--server.address", "localhost",
+            "--browser.gatherUsageStats", "false"
+        ])
+
+    except KeyboardInterrupt:
+        print("\n🛑 Demo stopped by user")
+    except Exception as e:
+        print(f"❌ Failed to launch demo: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
logger/custom_logger.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import yaml
4
+ from datetime import datetime
5
+ import sys
6
+ from typing import List
7
+
8
+ class CustomLoggerTracker:
9
+ _instance = None
10
+ _initialized = False
11
+
12
+ def __new__(cls, config_path='logging_config.yaml'):
13
+ """Singleton pattern to ensure only one instance exists."""
14
+ if cls._instance is None:
15
+ cls._instance = super(CustomLoggerTracker, cls).__new__(cls)
16
+ return cls._instance
17
+
18
+ def __init__(self, config_path='logging_config.yaml'):
19
+ """Initialize the custom logger with configuration."""
20
+ if self._initialized:
21
+ return
22
+
23
+ self.config = self._load_config(config_path)
24
+ self.loggers = {}
25
+ self.base_log_dir = self.config.get('base_log_dir', 'logs')
26
+ self._setup_base_directory()
27
+ self._initialized = True
28
+
29
+ def _load_config(self, config_path):
30
+ """Load configuration from YAML file."""
31
+ try:
32
+ with open(config_path, 'r') as file:
33
+ return yaml.safe_load(file)
34
+ except FileNotFoundError:
35
+ # Default configuration if file not found
36
+ return {
37
+ 'base_log_dir': 'logs',
38
+ 'default_level': 'INFO',
39
+ 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
40
+ 'console_output': True,
41
+ 'modules': {
42
+ 'main': {'level': 'INFO'},
43
+ 'utils': {'level': 'INFO'},
44
+ 'old_docs': {'level': 'INFO'},
45
+ 'rag': {'level': 'INFO'},
46
+ 'query_utils': {'level': 'INFO'},
47
+ 'prompt_temp': {'level': 'INFO'}
48
+ }
49
+ }
50
+
51
+ def _setup_base_directory(self):
52
+ """Setup the base directory structure for logs."""
53
+ if not os.path.exists(self.base_log_dir):
54
+ os.makedirs(self.base_log_dir)
55
+
56
+ def _get_log_path(self, module_name):
57
+ """Generate the hierarchical path for log files."""
58
+ now = datetime.now()
59
+ year_dir = os.path.join(self.base_log_dir, str(now.year))
60
+ month_dir = os.path.join(year_dir, f"{now.month:02d}")
61
+ day_dir = os.path.join(month_dir, f"{now.day:02d}")
62
+ os.makedirs(day_dir, exist_ok=True)
63
+ return os.path.join(day_dir, f"{module_name}.log")
64
+
65
+ def get_logger(self, module_name):
66
+ """Get or create a logger for a specific module."""
67
+ if module_name in self.loggers:
68
+ return self.loggers[module_name]
69
+
70
+ # Create new logger & Models Specific Config
71
+ logger = logging.getLogger(module_name)
72
+ module_config = self.config['modules'].get(module_name, {})
73
+ level = getattr(logging, module_config.get('level', self.config['default_level']))
74
+ logger.setLevel(level)
75
+
76
+ # Clear existing handlers to avoid duplicates
77
+ logger.handlers.clear()
78
+
79
+ # Create formatter
80
+ formatter = logging.Formatter(self.config.get('format'))
81
+
82
+ # Create file handler with the hierarchical path
83
+ log_path = self._get_log_path(module_name)
84
+ file_handler = logging.FileHandler(log_path)
85
+ file_handler.setFormatter(formatter)
86
+ logger.addHandler(file_handler)
87
+
88
+ # Optionally add console handler
89
+ if self.config.get('console_output', True):
90
+ console_handler = logging.StreamHandler()
91
+ console_handler.setFormatter(formatter)
92
+ logger.addHandler(console_handler)
93
+
94
+ # Prevent propagation to avoid duplicate messages
95
+ logger.propagate = False
96
+
97
+ self.loggers[module_name] = logger
98
+ return logger
99
+
100
+ def update_config(self, new_config):
101
+ """Update logger configuration."""
102
+ self.config.update(new_config)
103
+ # Reset all loggers to apply new configuration
104
+ for module_name in self.loggers:
105
+ logger = self.loggers[module_name]
106
+ for handler in logger.handlers[:]:
107
+ logger.removeHandler(handler)
108
+ self.loggers = {}
109
+
110
+ def log_message(self, process_log: List[str], message: str, level: str = "info", module: str = "default") -> None:
111
+ """
112
+ Append to process_log AND send to the central logger.
113
+
114
+ Args:
115
+ process_log: List to append the message to
116
+ message: The message to log
117
+ level: Log level ('info', 'warning', 'error')
118
+ module: Module name for the logger (optional, defaults to 'default')
119
+ """
120
+ process_log.append(message)
121
+
122
+ # Get the logger for the specified module
123
+ logger = self.get_logger(module)
124
+
125
+ # Log the message at the appropriate level
126
+ if level.lower() == "error":
127
+ logger.error(message)
128
+ elif level.lower() == "warning":
129
+ logger.warning(message)
130
+ else:
131
+ logger.info(message)
132
+
133
+ def log_info(self, message: str, module: str = "default") -> None:
134
+ """Log an info message."""
135
+ logger = self.get_logger(module)
136
+ logger.info(message)
137
+
138
+ def log_warning(self, message: str, module: str = "default") -> None:
139
+ """Log a warning message."""
140
+ logger = self.get_logger(module)
141
+ logger.warning(message)
142
+
143
+ def log_error(self, message: str, module: str = "default") -> None:
144
+ """Log an error message."""
145
+ logger = self.get_logger(module)
146
+ logger.error(message)
147
+
148
+ # Alternative method name matching the original _log helper's signature
149
+ def _log(self, process_log: List[str], message: str, level: str = "info", module: str = "default") -> None:
150
+ """Alias for log_message to match your original function name."""
151
+ self.log_message(process_log, message, level, module)
152
+
153
+
154
+ # Create a default instance for easy importing
155
+ default_logger = CustomLoggerTracker()
156
+
157
+ # Expose the methods at module level for easy importing
158
+ log_message = default_logger.log_message
159
+ log_info = default_logger.log_info
160
+ log_warning = default_logger.log_warning
161
+ log_error = default_logger.log_error
162
+ _log = default_logger._log
163
+
164
+
165
+ # Example usage
166
+ if __name__ == "__main__":
167
+ # Method 1: Create your own instance
168
+ logger_tracker = CustomLoggerTracker()
169
+ process_log = []
170
+
171
+ logger_tracker.log_message(process_log, "This is a test info message", "info", "registration")
172
+ logger_tracker.log_message(process_log, "This is a warning message", "warning", "registration")
173
+ logger_tracker.log_message(process_log, "This is an error message", "error", "registration")
174
+
175
+ # Method 2: Use the default instance functions
176
+ process_log2 = []
177
+ log_message(process_log2, "Using default logger", "info", "detection")
178
+ _log(process_log2, "Using _log alias", "warning", "detection")
179
+
180
+ # Method 3: Direct logging without process_log
181
+ log_info("Direct info message", "main")
182
+ log_warning("Direct warning message", "main")
183
+ log_error("Direct error message", "main")
184
+
185
+ print("Process log 1 contents:")
186
+ for log_entry in process_log:
187
+ print(f" {log_entry}")
188
+
189
+ print("Process log 2 contents:")
190
+ for log_entry in process_log2:
191
+ print(f" {log_entry}")
logger/logging_config.yaml ADDED
@@ -0,0 +1,30 @@
1
+ base_log_dir: logs
2
+ default_level: INFO
3
+ console_output: true
4
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
5
+
6
+ modules:
7
+ main:
8
+ level: DEBUG
9
+
10
+ utils:
11
+ level: DEBUG
12
+
13
+ old_docs:
14
+ level: DEBUG
15
+
16
+ prompt_template:
17
+ level: DEBUG
18
+
19
+ rag:
20
+ level: DEBUG
21
+
22
+
23
+ rag_utils:
24
+ level: DEBUG
25
+
26
+ rag_steps:
27
+ level: DEBUG
28
+
29
+
30
+
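A minimal sketch of how this YAML drives the CustomLoggerTracker above, assuming the tracker's default config path resolves to logger/logging_config.yaml. Note that the module keys here (prompt_template, rag_utils, rag_steps) differ from the in-code fallback names (prompt_temp, query_utils); modules not listed in either place simply fall back to default_level:

    from logger.custom_logger import CustomLoggerTracker

    tracker = CustomLoggerTracker()           # loads base_log_dir, format, and the modules map
    rag_logger = tracker.get_logger("rag")    # picks up "level: DEBUG" from this file
    rag_logger.debug("retrieval step done")   # written to logs/YYYY/MM/DD/rag.log and the console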
quick_start.py ADDED
@@ -0,0 +1,262 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick start script for Manufacturing RAG Agent.
4
+
5
+ This script provides a simple way to test the RAG system without the full Streamlit interface.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+ from dotenv import load_dotenv
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
+ def check_setup():
17
+ """Check if the system is properly set up."""
18
+ required_keys = ['GROQ_API_KEY', 'SILICONFLOW_API_KEY']
19
+ missing_keys = []
20
+
21
+ for key in required_keys:
22
+ if not os.getenv(key):
23
+ missing_keys.append(key)
24
+
25
+ if missing_keys:
26
+ print(f"❌ Missing required environment variables: {', '.join(missing_keys)}")
27
+ print("📝 Please set them in your .env file")
28
+ return False
29
+
30
+ return True
31
+
32
+ def test_document_processing():
33
+ """Test document processing with a simple example."""
34
+ print("🧪 Testing document processing...")
35
+
36
+ try:
37
+ from src.document_processor import DocumentProcessorFactory
38
+ from src.pdf_processor import PDFProcessor
39
+ from src.excel_processor import ExcelProcessor
40
+ from src.image_processor import ImageProcessor
41
+
42
+ # Test processor factory
43
+ config = {
44
+ 'max_file_size_mb': 10,
45
+ 'image_processing': True,
46
+ 'table_extraction': True
47
+ }
48
+
49
+ # Register processors
50
+ from src.document_processor import DocumentType
51
+ DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
52
+ DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)
53
+ DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
54
+
55
+ print("✅ Document processors initialized")
56
+ return True
57
+
58
+ except Exception as e:
59
+ print(f"❌ Document processing test failed: {e}")
60
+ return False
61
+
62
+ def test_embedding_system():
63
+ """Test embedding system."""
64
+ print("🧪 Testing embedding system...")
65
+
66
+ try:
67
+ from src.embedding_system import EmbeddingSystem
68
+
69
+ config = {
70
+ 'siliconflow_api_key': os.getenv('SILICONFLOW_API_KEY'),
71
+ 'embedding_model': 'BAAI/bge-large-zh-v1.5',
72
+ 'reranker_model': 'BAAI/bge-reranker-large',
73
+ 'batch_size': 2,
74
+ 'max_retries': 2,
75
+ 'enable_embedding_cache': True
76
+ }
77
+
78
+ embedding_system = EmbeddingSystem(config)
79
+ print("✅ Embedding system initialized")
80
+
81
+ # Test with simple text
82
+ test_texts = ["Manufacturing process efficiency", "Quality control metrics"]
83
+ print("🔄 Generating test embeddings...")
84
+
85
+ # Note: This will make an actual API call
86
+ embeddings = embedding_system.generate_embeddings(test_texts)
87
+
88
+ if embeddings and len(embeddings) == 2:
89
+ print(f"✅ Generated embeddings: {len(embeddings)} vectors of size {len(embeddings[0])}")
90
+ return True
91
+ else:
92
+ print("❌ Embedding generation failed")
93
+ return False
94
+
95
+ except Exception as e:
96
+ print(f"❌ Embedding system test failed: {e}")
97
+ return False
98
+
99
+ def test_llm_system():
100
+ """Test LLM system."""
101
+ print("🧪 Testing LLM system...")
102
+
103
+ try:
104
+ from src.groq_client import LLMSystem
105
+
106
+ config = {
107
+ 'groq_api_key': os.getenv('GROQ_API_KEY'),
108
+ 'llm_model': 'llama-3.1-8b-instant',
109
+ 'max_retries': 2
110
+ }
111
+
112
+ llm_system = LLMSystem(config)
113
+ print("✅ LLM system initialized")
114
+
115
+ # Test with simple question
116
+ print("🔄 Testing question answering...")
117
+ context = "The manufacturing yield for Q3 was 95.2% with a total production of 10,000 units."
118
+ question = "What was the manufacturing yield for Q3?"
119
+
120
+ # Note: This will make an actual API call
121
+ answer = llm_system.answer_question(question, context)
122
+
123
+ if answer and "95.2%" in answer:
124
+ print(f"✅ LLM response: {answer[:100]}...")
125
+ return True
126
+ else:
127
+ print(f"❌ Unexpected LLM response: {answer}")
128
+ return False
129
+
130
+ except Exception as e:
131
+ print(f"❌ LLM system test failed: {e}")
132
+ return False
133
+
134
+ def test_qdrant_connection():
135
+ """Test Qdrant connection."""
136
+ print("🧪 Testing Qdrant connection...")
137
+
138
+ try:
139
+ from src.vector_store import QdrantVectorStore
140
+
141
+ config = {
142
+ 'qdrant_url': os.getenv('QDRANT_URL', 'http://localhost:6333'),
143
+ 'qdrant_api_key': os.getenv('QDRANT_API_KEY'),
144
+ 'qdrant_collection': 'test_collection',
145
+ 'vector_size': 1024
146
+ }
147
+
148
+ vector_store = QdrantVectorStore(config)
149
+
150
+ # Test health check
151
+ if vector_store.health_check():
152
+ print("✅ Qdrant connection successful")
153
+
154
+ # Get collection info
155
+ info = vector_store.get_collection_info()
156
+ if info:
157
+ print(f"✅ Collection info: {info.total_points} points")
158
+
159
+ return True
160
+ else:
161
+ print("❌ Qdrant health check failed")
162
+ return False
163
+
164
+ except Exception as e:
165
+ print(f"❌ Qdrant connection test failed: {e}")
166
+ return False
167
+
168
+ def run_simple_demo():
169
+ """Run a simple demo of the RAG system."""
170
+ print("\n🚀 Running Simple RAG Demo")
171
+ print("=" * 40)
172
+
173
+ try:
174
+ # Initialize components
175
+ config = {
176
+ 'siliconflow_api_key': os.getenv('SILICONFLOW_API_KEY'),
177
+ 'groq_api_key': os.getenv('GROQ_API_KEY'),
178
+ 'qdrant_url': os.getenv('QDRANT_URL', 'http://localhost:6333'),
179
+ 'qdrant_api_key': os.getenv('QDRANT_API_KEY'),
180
+ 'qdrant_collection': 'demo_collection',
181
+ 'embedding_model': 'BAAI/bge-large-zh-v1.5',
182
+ 'reranker_model': 'BAAI/bge-reranker-large',
183
+ 'llm_model': 'llama-3.1-8b-instant',
184
+ 'max_context_chunks': 3,
185
+ 'similarity_threshold': 0.7,
186
+ 'rerank_top_k': 10,
187
+ 'final_top_k': 3,
188
+ 'vector_size': 1024
189
+ }
190
+
191
+ print("🔄 Initializing RAG engine...")
192
+ from src.rag_engine import RAGEngine
193
+ rag_engine = RAGEngine(config)
194
+
195
+ print("✅ RAG engine initialized successfully!")
196
+ print("\n💡 The system is ready. You can now:")
197
+ print(" 1. Run the full demo: python launch_rag_demo.py")
198
+ print(" 2. Upload documents and ask questions")
199
+ print(" 3. View analytics and system status")
200
+
201
+ return True
202
+
203
+ except Exception as e:
204
+ print(f"❌ RAG demo failed: {e}")
205
+ return False
206
+
207
+ def main():
208
+ """Main function."""
209
+ print("🏭 Manufacturing RAG Agent - Quick Start")
210
+ print("=" * 50)
211
+
212
+ # Check setup
213
+ if not check_setup():
214
+ print("\n📝 Setup Instructions:")
215
+ print("1. Copy .env.example to .env")
216
+ print("2. Add your API keys:")
217
+ print(" - GROQ_API_KEY from https://console.groq.com/")
218
+ print(" - SILICONFLOW_API_KEY from https://siliconflow.cn/")
219
+ print("3. Set up Qdrant:")
220
+ print(" - Local: docker run -p 6333:6333 qdrant/qdrant")
221
+ print(" - Cloud: https://cloud.qdrant.io/")
222
+ return
223
+
224
+ print("✅ Environment variables configured")
225
+
226
+ # Run tests
227
+ tests = [
228
+ ("Document Processing", test_document_processing),
229
+ ("Qdrant Connection", test_qdrant_connection),
230
+ ("Embedding System", test_embedding_system),
231
+ ("LLM System", test_llm_system),
232
+ ]
233
+
234
+ results = {}
235
+ for test_name, test_func in tests:
236
+ print(f"\n{'='*20}")
237
+ try:
238
+ results[test_name] = test_func()
239
+ except Exception as e:
240
+ print(f"❌ {test_name} failed with error: {e}")
241
+ results[test_name] = False
242
+
243
+ # Summary
244
+ print(f"\n{'='*50}")
245
+ print("📋 Test Results Summary:")
246
+ for test_name, result in results.items():
247
+ status = "✅ PASS" if result else "❌ FAIL"
248
+ print(f" {test_name}: {status}")
249
+
250
+ # Run demo if all critical tests pass
251
+ critical_tests = ["Qdrant Connection", "Embedding System", "LLM System"]
252
+ if all(results.get(test, False) for test in critical_tests):
253
+ run_simple_demo()
254
+ else:
255
+ print("\n⚠️ Some critical tests failed. Please fix the issues above.")
256
+ print("💡 Common solutions:")
257
+ print(" - Check your API keys in .env file")
258
+ print(" - Ensure Qdrant is running (local or cloud)")
259
+ print(" - Install missing dependencies: pip install -r requirements.txt")
260
+
261
+ if __name__ == "__main__":
262
+ main()
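quick_start.py reads its credentials from a .env file via python-dotenv. A minimal .env sketch, using only the variable names the script itself reads (values are placeholders):

    GROQ_API_KEY=your-groq-key
    SILICONFLOW_API_KEY=your-siliconflow-key
    QDRANT_URL=http://localhost:6333
    QDRANT_API_KEY=your-qdrant-key   # may be left empty for an unsecured local Qdrant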
rag_launcher.py ADDED
@@ -0,0 +1,604 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from pathlib import Path
6
+ import tempfile
7
+ import time
8
+ import json
9
+ import logging
10
+ import os
11
+ import sys
12
+ from typing import Dict, Any, Tuple, List
13
+ from datetime import datetime
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+
17
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
+
19
+ # Setup logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+ try:
24
+ from src.config import Config
25
+ from src.ingestion_pipeline import DocumentIngestionPipeline, IngestionResult
26
+ from src.rag_engine import RAGEngine, RAGResponse
27
+ from src.metadata_manager import MetadataManager
28
+ from src.document_processor import ProcessingStatus, DocumentProcessorFactory, DocumentType
29
+ from src.pdf_processor import PDFProcessor
30
+ from src.excel_processor import ExcelProcessor
31
+ from src.image_processor import ImageProcessor
32
+
33
+ except ImportError as e:
34
+ logger.error(f"Failed to import RAG components: {e}")
35
+ print(f"❌ Import Error: {e}")
36
+ print("Please ensure all src/ modules are properly structured and dependencies are installed")
37
+ sys.exit(1)
38
+
39
+
40
+
41
+
42
+ class RAGGradioDemo:
43
+ """Fixed Gradio demo application for the Manufacturing RAG Agent."""
44
+
45
+ def __init__(self):
46
+ """Initialize the RAG demo application."""
47
+ self.config = None
48
+ self.ingestion_pipeline = None
49
+ self.rag_engine = None
50
+ self.metadata_manager = None
51
+
52
+ # Initialize session state tracking
53
+ self.system_initialized = False
54
+ self.documents = []
55
+ self.chat_history = []
56
+
57
+ def initialize_system(self) -> Tuple[bool, str]:
58
+ """Initialize the RAG system components with better error handling."""
59
+ try:
60
+ # Find config file
61
+ config_paths = [
62
+ "src/config.yaml",
63
+ "config.yaml",
64
+ os.path.join(os.path.dirname(__file__), "config.yaml"),
65
+ os.path.join(os.path.dirname(os.path.dirname(__file__)), "src", "config.yaml")
66
+ ]
67
+
68
+ config_path = None
69
+ for path in config_paths:
70
+ if os.path.exists(path):
71
+ config_path = path
72
+ break
73
+
74
+ if not config_path:
75
+ return False, f"Configuration file not found. Searched: {config_paths}"
76
+
77
+ logger.info(f"Using config file: {config_path}")
78
+
79
+ # Load configuration
80
+ self.config = Config(config_path)
81
+
82
+ # Validate API keys
83
+ required_keys = {
84
+ 'GROQ_API_KEY': self.config.groq_api_key,
85
+ 'SILICONFLOW_API_KEY': self.config.siliconflow_api_key,
86
+ 'QDRANT_URL': self.config.qdrant_url
87
+ }
88
+
89
+ missing_keys = [k for k, v in required_keys.items() if not v]
90
+ if missing_keys:
91
+ return False, f"Missing required environment variables: {', '.join(missing_keys)}"
92
+
93
+ # Create config dictionary using your config structure
94
+ rag_config = self.config.rag_config
95
+
96
+ config_dict = {
97
+ # API keys
98
+ 'siliconflow_api_key': self.config.siliconflow_api_key,
99
+ 'groq_api_key': self.config.groq_api_key,
100
+
101
+ # Qdrant configuration
102
+ 'qdrant_url': self.config.qdrant_url,
103
+ 'qdrant_api_key': self.config.qdrant_api_key,
104
+ 'qdrant_collection': 'manufacturing_docs',
105
+
106
+ # Model configuration from your config.yaml
107
+ 'embedding_model': rag_config.get('embedding_model', 'Qwen/Qwen3-Embedding-8B'),
108
+ 'reranker_model': rag_config.get('reranker_model', 'Qwen/Qwen3-Reranker-8B'),
109
+ 'llm_model': rag_config.get('llm_model', 'openai/gpt-oss-120b'),
110
+
111
+ # Vector configuration
112
+ 'vector_size': 1024, # Adjust to the embedding model's output size (e.g. 4096 for Qwen/Qwen3-Embedding-8B)
113
+
114
+ # RAG parameters from your config
115
+ 'max_context_chunks': rag_config.get('max_context_chunks', 5),
116
+ 'similarity_threshold': rag_config.get('similarity_threshold', 0.7),
117
+ 'rerank_top_k': rag_config.get('rerank_top_k', 20),
118
+ 'final_top_k': rag_config.get('final_top_k', 5),
119
+
120
+ # Text processing
121
+ 'chunk_size': rag_config.get('chunk_size', 512),
122
+ 'chunk_overlap': rag_config.get('chunk_overlap', 50),
123
+ 'max_context_length': 4000,
124
+
125
+ # Document processing
126
+ 'image_processing': True,
127
+ 'table_extraction': True,
128
+ 'max_file_size_mb': 100,
129
+
130
+ # Storage
131
+ 'metadata_db_path': './data/metadata.db',
132
+
133
+ # Performance
134
+ 'max_retries': 3,
135
+ 'batch_size': 32,
136
+ 'enable_caching': True,
137
+ 'temperature': 0.1,
138
+ 'max_tokens': 1024
139
+ }
140
+
141
+ # Register document processors
142
+ DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
143
+ DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)
144
+ DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
145
+
146
+ # Initialize components with error handling
147
+ try:
148
+ self.metadata_manager = MetadataManager(config_dict)
149
+ logger.info("✅ Metadata manager initialized")
150
+
151
+ self.ingestion_pipeline = DocumentIngestionPipeline(config_dict)
152
+ logger.info("✅ Ingestion pipeline initialized")
153
+
154
+ self.rag_engine = RAGEngine(config_dict)
155
+ logger.info("✅ RAG engine initialized")
156
+
157
+ except Exception as e:
158
+ return False, f"Failed to initialize components: {str(e)}"
159
+
160
+ self.system_initialized = True
161
+ return True, "RAG system initialized successfully!"
162
+
163
+ except Exception as e:
164
+ error_msg = f"Failed to initialize RAG system: {str(e)}"
165
+ logger.error(error_msg)
166
+ return False, error_msg
167
+
168
+ def process_uploaded_files(self, files) -> Tuple[str, pd.DataFrame]:
169
+ """Process uploaded files with improved error handling."""
170
+ if not self.system_initialized:
171
+ return "❌ System not initialized. Please initialize first.", pd.DataFrame()
172
+
173
+ if not files:
174
+ return "No files uploaded.", pd.DataFrame()
175
+
176
+ results = []
177
+ total_files = len(files)
178
+
179
+ try:
180
+ for i, file in enumerate(files):
181
+ logger.info(f"Processing file {i+1}/{total_files}: {file.name}")
182
+
183
+ # Save uploaded file temporarily
184
+ temp_path = None
185
+ try:
186
+ # Create temporary file with proper extension
187
+ suffix = Path(file.name).suffix
188
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
189
+ # Read file content
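+ # NOTE (assumption): this relies on Gradio's legacy file wrapper exposing .read();
+ # with type="filepath" (Gradio 4+) each item is a path string, so copy the file instead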
190
+ file_content = file.read()
191
+ tmp_file.write(file_content)
192
+ temp_path = tmp_file.name
193
+
194
+ logger.info(f"Saved temp file: {temp_path}")
195
+
196
+ # Process document
197
+ result = self.ingestion_pipeline.ingest_document(temp_path)
198
+
199
+ # Add result info
200
+ results.append({
201
+ 'Filename': file.name,
202
+ 'Status': '✅ Success' if result.success else '❌ Failed',
203
+ 'Chunks Created': result.chunks_created,
204
+ 'Chunks Indexed': result.chunks_indexed,
205
+ 'Processing Time (s)': f"{result.processing_time:.2f}",
206
+ 'Error Message': result.error_message or 'None'
207
+ })
208
+
209
+ logger.info(f"Processing result: {'Success' if result.success else 'Failed'}")
210
+
211
+ except Exception as e:
212
+ logger.error(f"Error processing {file.name}: {e}")
213
+ results.append({
214
+ 'Filename': file.name,
215
+ 'Status': '❌ Failed',
216
+ 'Chunks Created': 0,
217
+ 'Chunks Indexed': 0,
218
+ 'Processing Time (s)': '0.00',
219
+ 'Error Message': str(e)
220
+ })
221
+
222
+ finally:
223
+ # Clean up temporary file
224
+ if temp_path and os.path.exists(temp_path):
225
+ try:
226
+ os.unlink(temp_path)
227
+ logger.info(f"Cleaned up temp file: {temp_path}")
228
+ except Exception as e:
229
+ logger.warning(f"Failed to clean up temp file: {e}")
230
+
231
+ # Create results summary
232
+ successful = sum(1 for r in results if 'Success' in r['Status'])
233
+ total_chunks = sum(r['Chunks Indexed'] for r in results if isinstance(r['Chunks Indexed'], int))
234
+
235
+ status_msg = f"✅ Processing Complete: {successful}/{total_files} files processed successfully. Total chunks indexed: {total_chunks}"
236
+
237
+ return status_msg, pd.DataFrame(results)
238
+
239
+ except Exception as e:
240
+ error_msg = f"❌ Batch processing failed: {str(e)}"
241
+ logger.error(error_msg)
242
+ return error_msg, pd.DataFrame(results) if results else pd.DataFrame()
243
+
244
+ def ask_question(self, question: str, max_results: int = 5,
245
+ similarity_threshold: float = 0.7) -> Tuple[str, str, pd.DataFrame]:
246
+ """Process a question through the RAG engine with better error handling."""
247
+ if not self.system_initialized:
248
+ return "❌ System not initialized. Please initialize first.", "", pd.DataFrame()
249
+
250
+ if not question.strip():
251
+ return "Please enter a question.", "", pd.DataFrame()
252
+
253
+ try:
254
+ try:
255
+ documents = self.metadata_manager.list_documents(
256
+ status=ProcessingStatus.COMPLETED,
257
+ limit=1
258
+ )
259
+ if not documents:
260
+ return "⚠️ No processed documents available. Please upload and process documents first.", "", pd.DataFrame()
261
+ except Exception as e:
262
+ logger.error(f"Failed to check documents: {e}")
263
+ return "❌ Error checking document availability.", "", pd.DataFrame()
264
+
265
+ # Update RAG engine config temporarily for this query
266
+ original_final_top_k = self.rag_engine.final_top_k
267
+ original_similarity_threshold = self.rag_engine.similarity_threshold
268
+
269
+ self.rag_engine.final_top_k = max_results
270
+ self.rag_engine.similarity_threshold = similarity_threshold
271
+
272
+ # Get response
273
+ logger.info(f"Asking question: {question[:50]}...")
274
+ response = self.rag_engine.answer_question(question)
275
+
276
+ # Restore original config
277
+ self.rag_engine.final_top_k = original_final_top_k
278
+ self.rag_engine.similarity_threshold = original_similarity_threshold
279
+
280
+ # Add to chat history
281
+ self.chat_history.append((question, response))
282
+
283
+ # Format answer
284
+ if not response.success:
285
+ return f"❌ Failed to generate answer: {response.error_message}", "", pd.DataFrame()
286
+
287
+ # Create citations info
288
+ citations_info = self._format_citations(response.citations)
289
+
290
+ # Create performance dataframe
291
+ performance_data = {
292
+ 'Metric': ['Confidence Score', 'Processing Time (s)', 'Retrieval Time (s)',
293
+ 'Generation Time (s)', 'Rerank Time (s)', 'Sources Used', 'Chunks Retrieved'],
294
+ 'Value': [
295
+ f"{response.confidence_score:.3f}",
296
+ f"{response.processing_time:.3f}",
297
+ f"{response.retrieval_time:.3f}",
298
+ f"{response.generation_time:.3f}",
299
+ f"{response.rerank_time:.3f}",
300
+ len(response.citations),
301
+ response.total_chunks_retrieved
302
+ ]
303
+ }
304
+
305
+ performance_df = pd.DataFrame(performance_data)
306
+
307
+ return response.answer, citations_info, performance_df
308
+
309
+ except Exception as e:
310
+ error_msg = f"❌ Question processing failed: {str(e)}"
311
+ logger.error(error_msg)
312
+ return error_msg, "", pd.DataFrame()
313
+
314
+ def _format_citations(self, citations) -> str:
315
+ """Format citations for display."""
316
+ if not citations:
317
+ return "No citations available."
318
+
319
+ citation_text = "## 📚 Sources & Citations\n\n"
320
+
321
+ for i, citation in enumerate(citations):
322
+ citation_text += f"**Source {i+1}:** {citation.source_file} (Confidence: {citation.confidence:.3f})\n"
323
+
324
+ # Add specific location info
325
+ location_parts = []
326
+ if citation.page_number:
327
+ location_parts.append(f"📄 Page: {citation.page_number}")
328
+ if citation.worksheet_name:
329
+ location_parts.append(f"📊 Sheet: {citation.worksheet_name}")
330
+ if citation.cell_range:
331
+ location_parts.append(f"🔢 Range: {citation.cell_range}")
332
+ if citation.section_title:
333
+ location_parts.append(f"📑 Section: {citation.section_title}")
334
+
335
+ if location_parts:
336
+ citation_text += f"*Location:* {' | '.join(location_parts)}\n"
337
+
338
+ citation_text += f"*Excerpt:* \"{citation.text_snippet}\"\n\n"
339
+
340
+ return citation_text
341
+
342
+
343
+
344
+ def get_document_library(self):
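+ """Return the processed-document list from the metadata store as a display-ready DataFrame."""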
345
+ if not self.system_initialized:
346
+ return pd.DataFrame({'Message': ['System not initialized']})
347
+ try:
348
+ documents = self.metadata_manager.list_documents(limit=50)
349
+ if not documents:
350
+ return pd.DataFrame({'Message': ['No documents processed yet']})
351
+ doc_data = []
352
+ for doc in documents:
353
+ doc_data.append({
354
+ 'Filename': doc.filename,
355
+ 'Type': doc.file_type.upper(),
356
+ 'Status': doc.processing_status.value.title(),
357
+ 'Chunks': doc.total_chunks,
358
+ 'Size': self._format_file_size(doc.file_size),
359
+ 'Uploaded': doc.upload_timestamp.strftime('%Y-%m-%d %H:%M')
360
+ })
361
+ return pd.DataFrame(doc_data)
362
+ except Exception as e:
363
+ logger.error(f"Failed to get document library: {e}")
364
+ return pd.DataFrame({'Error': [str(e)]})
365
+
366
+
367
+ def get_system_status(self) -> Tuple[str, pd.DataFrame]:
368
+ """Get system status and health information."""
369
+ if not self.system_initialized:
370
+ return "❌ System not initialized", pd.DataFrame()
371
+ try:
372
+ # Health checks
373
+ rag_health = self.rag_engine.health_check()
374
+ pipeline_health = self.ingestion_pipeline.health_check()
375
+ # Create status message
376
+ status_parts = []
377
+ all_health = {**rag_health, **pipeline_health}
378
+ for component, healthy in all_health.items():
379
+ status = "✅ Healthy" if healthy else "❌ Unhealthy"
380
+ status_parts.append(f"**{component.replace('_', ' ').title()}:** {status}")
381
+
382
+ status_message = "## 🏥 System Health\n" + "\n".join(status_parts)
383
+
384
+ # Create detailed status table
385
+ health_data = []
386
+ for component, healthy in all_health.items():
387
+ health_data.append({
388
+ 'Component': component.replace('_', ' ').title(),
389
+ 'Status': '✅ Healthy' if healthy else '❌ Unhealthy',
390
+ 'Last Checked': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
391
+ })
392
+
393
+ return status_message, pd.DataFrame(health_data)
394
+
395
+ except Exception as e:
396
+ error_msg = f"❌ Failed to check system status: {str(e)}"
397
+ logger.error(error_msg)
398
+ return error_msg, pd.DataFrame()
399
+
400
+ def _format_file_size(self, size_bytes: int) -> str:
401
+ """Format file size in human readable format."""
402
+ if size_bytes == 0:
403
+ return "0B"
404
+
405
+ size_names = ["B", "KB", "MB", "GB", "TB"]
406
+ i = 0
407
+ while size_bytes >= 1024 and i < len(size_names) - 1:
408
+ size_bytes /= 1024.0
409
+ i += 1
410
+
411
+ return f"{size_bytes:.1f}{size_names[i]}"
412
+
413
+
414
+ def create_gradio_interface():
415
+ """Create the main Gradio interface with proper error handling."""
416
+
417
+ # Initialize demo instance
418
+ demo_instance = RAGGradioDemo()
419
+
420
+ # Define the interface
421
+ with gr.Blocks(title="Manufacturing RAG Agent", theme=gr.themes.Soft()) as demo:
422
+ gr.Markdown("""
423
+ # 🏭 Manufacturing RAG Agent
424
+ *Intelligent document analysis for manufacturing data*
425
+
426
+ This system allows you to upload manufacturing documents (PDF, Excel, Images) and ask questions about their content using SiliconFlow embeddings and Groq LLM.
427
+ """)
428
+
429
+ # System initialization status
430
+ with gr.Row():
431
+ system_status = gr.Markdown("**System Status:** Not initialized")
432
+ init_btn = gr.Button("🚀 Initialize System", variant="primary")
433
+
434
+ # Main functionality tabs
435
+ with gr.Tabs():
436
+ # Document Upload Tab
437
+ with gr.TabItem("📄 Document Upload"):
438
+ gr.Markdown("### Upload and Process Documents")
439
+
440
+ with gr.Row():
441
+ with gr.Column():
442
+ file_upload = gr.File(
443
+ file_count="multiple",
444
+ file_types=[".pdf", ".xlsx", ".xls", ".xlsm", ".png", ".jpg", ".jpeg"],
445
+ label="Choose files to upload (PDF, Excel, Images)"
446
+ )
447
+ upload_btn = gr.Button("🔄 Process Documents", variant="primary")
448
+ upload_status = gr.Textbox(
449
+ label="Processing Status",
450
+ interactive=False,
451
+ lines=3
452
+ )
453
+
454
+ # Results display
455
+ upload_results = gr.Dataframe(
456
+ label="Processing Results",
457
+ interactive=False
458
+ )
459
+
460
+ # Document Library
461
+ gr.Markdown("### 📚 Document Library")
462
+ refresh_docs_btn = gr.Button("🔄 Refresh Library")
463
+ doc_library = gr.Dataframe(
464
+ label="Uploaded Documents",
465
+ interactive=False
466
+ )
467
+
468
+ # Question Answering Tab
469
+ with gr.TabItem("❓ Ask Questions"):
470
+ gr.Markdown("### Ask Questions About Your Documents")
471
+
472
+ with gr.Row():
473
+ with gr.Column(scale=2):
474
+ question_input = gr.Textbox(
475
+ label="Your Question",
476
+ placeholder="e.g., What is the production yield mentioned in the documents?",
477
+ lines=2
478
+ )
479
+ ask_btn = gr.Button("🔍 Ask Question", variant="primary")
480
+
481
+ with gr.Column(scale=1):
482
+ gr.Markdown("#### Settings")
483
+ max_results = gr.Slider(
484
+ minimum=1, maximum=10, value=5, step=1,
485
+ label="Max Context Chunks"
486
+ )
487
+ similarity_threshold = gr.Slider(
488
+ minimum=0.0, maximum=1.0, value=0.7, step=0.1,
489
+ label="Similarity Threshold"
490
+ )
491
+
492
+ # Answer display
493
+ answer_output = gr.Markdown(label="Answer")
494
+ citations_output = gr.Markdown(label="Citations")
495
+
496
+ # Performance metrics
497
+ performance_metrics = gr.Dataframe(
498
+ label="Performance Metrics",
499
+ interactive=False
500
+ )
501
+
502
+ # System Status Tab
503
+ with gr.TabItem("⚙️ System Status"):
504
+ gr.Markdown("### System Health & Information")
505
+
506
+ check_health_btn = gr.Button("🔍 Check System Health")
507
+ health_status = gr.Markdown("Click 'Check System Health' to view status...")
508
+ health_details = gr.Dataframe(
509
+ label="Component Health Details",
510
+ interactive=False
511
+ )
512
+
513
+ # Event handlers
514
+ def initialize_system():
515
+ """Initialize the system and return status."""
516
+ success, message = demo_instance.initialize_system()
517
+ if success:
518
+ return f"**System Status:** <span style='color: green'>✅ {message}</span>"
519
+ else:
520
+ return f"**System Status:** <span style='color: red'>❌ {message}</span>"
521
+
522
+ def process_files(files):
523
+ """Process uploaded files."""
524
+ if not files:
525
+ return "No files selected", pd.DataFrame()
526
+ return demo_instance.process_uploaded_files(files)
527
+
528
+ def ask_question(question, max_results, similarity_threshold):
529
+ """Ask a question."""
530
+ if not question.strip():
531
+ return "Please enter a question", "", pd.DataFrame()
532
+ return demo_instance.ask_question(question, max_results, similarity_threshold)
533
+
534
+ def refresh_library():
535
+ """Refresh document library."""
536
+ return demo_instance.get_document_library()
537
+
538
+ def check_health():
539
+ """Check system health."""
540
+ return demo_instance.get_system_status()
541
+
542
+ # Connect events
543
+ init_btn.click(
544
+ initialize_system,
545
+ outputs=[system_status]
546
+ )
547
+
548
+ upload_btn.click(
549
+ process_files,
550
+ inputs=[file_upload],
551
+ outputs=[upload_status, upload_results]
552
+ )
553
+
554
+ ask_btn.click(
555
+ ask_question,
556
+ inputs=[question_input, max_results, similarity_threshold],
557
+ outputs=[answer_output, citations_output, performance_metrics]
558
+ )
559
+
560
+ refresh_docs_btn.click(
561
+ refresh_library,
562
+ outputs=[doc_library]
563
+ )
564
+
565
+ check_health_btn.click(
566
+ check_health,
567
+ outputs=[health_status, health_details]
568
+ )
569
+
570
+ # Auto-refresh library after upload
571
+ upload_btn.click(
572
+ refresh_library,
573
+ outputs=[doc_library]
574
+ )
575
+
576
+ return demo
577
+
578
+
579
+ def main():
580
+ """Main function to launch the Gradio demo."""
581
+ try:
582
+ # Create directories
583
+ os.makedirs("data", exist_ok=True)
584
+ os.makedirs("logs", exist_ok=True)
585
+
586
+ # Create and launch the interface
587
+ demo = create_gradio_interface()
588
+
589
+ # Launch with configuration
590
+ demo.launch(
591
+ server_name="0.0.0.0",
592
+ server_port=7860,
593
+ share=False,
594
+ debug=True,
595
+ show_error=True
596
+ )
597
+
598
+ except Exception as e:
599
+ print(f"❌ Failed to launch Gradio demo: {e}")
600
+ print("Please check your configuration and dependencies.")
601
+
602
+
603
+ if __name__ == "__main__":
604
+ main()
requirements.txt ADDED
@@ -0,0 +1,29 @@
1
+ # Core dependencies
2
+ pandas>=1.5.0
3
+ pyyaml>=6.0
4
+ python-dotenv>=1.0.0
5
+ requests>=2.31.0
6
+ numpy>=1.24.0
7
+
8
+ # UI and visualization
9
+ gradio>=4.0.0
10
+ streamlit>=1.28.0
11
+ plotly>=5.15.0
12
+ matplotlib>=3.7.0
13
+ seaborn>=0.12.0
14
+
15
+ # RAG System Dependencies
16
+ groq>=0.4.0
17
+ qdrant-client>=1.7.0
18
+ PyMuPDF>=1.23.0
19
+ openpyxl>=3.1.0
20
+ pytesseract>=0.3.10
21
+ Pillow>=10.0.0
22
+ sqlalchemy>=2.0.0
23
+
24
+ # Optional: For evaluation (may cause protobuf conflicts)
25
+ # deepeval>=0.21.0
26
+ # google-generativeai>=0.3.0
27
+
28
+ # Fix protobuf version conflicts
29
+ protobuf>=4.21.0,<5.0.0
results/evaluation_results_20250912_115542.json ADDED
@@ -0,0 +1,96 @@
1
+ {
2
+ "total_cases": 12,
3
+ "passed_cases": 0,
4
+ "failed_cases": 0,
5
+ "pass_rate": 0.0,
6
+ "average_score": 0,
7
+ "threshold": 0.8,
8
+ "model_name": "gemini-2.0-flash",
9
+ "evaluation_timestamp": "2025-09-12T11:55:42.620678",
10
+ "detailed_results": [
11
+ {
12
+ "case_index": 0,
13
+ "input": "\"이번 달 우리 회사 전체 매출은 얼마야?\"",
14
+ "actual_output": "A: 2025년 1월 삼광 Global 전체 매출은 335.4억원입니다. 이는 당초 사업계획(213.4억원) 대비 57% 초과 달성한 수치이며, 실행계획(307.8억원) 대비도 109% 달성한 성과입니다.",
15
+ "expected_output": null,
16
+ "metrics": {}
17
+ },
18
+ {
19
+ "case_index": 1,
20
+ "input": "\"사업부별 매출 비중이 어떻게 되나요?\"",
21
+ "actual_output": "A: 2025년 1월 기준 사업부별 매출 비중은 다음과 같습니다:\n\n한국 사업부: 213.0억원 (39.7%)\n베트남 사업부: 38.6억원 (44.1%)\n인도 사업부: 미미한 수준\n윈테크: 미미한 수준 한국과 베트남 사업부가 전체 매출의 약 84%를 차지하고 있습니다.\n",
22
+ "expected_output": null,
23
+ "metrics": {}
24
+ },
25
+ {
26
+ "case_index": 2,
27
+ "input": "\"우리 회사 영업이익률은 몇 %야?\"",
28
+ "actual_output": "A: 2025년 1월 전사 영업이익률은 3%입니다. 영업이익은 8.97억원이며, 사업부별로는 한국 4%, 베트남 2%, 윈테크는 -7%의 영업이익률을 기록했습니다.\n\n 생산/품질 관련 질문",
29
+ "expected_output": null,
30
+ "metrics": {}
31
+ },
32
+ {
33
+ "case_index": 3,
34
+ "input": "\"TAB S10 도장 공정 수율이 어떻게 되나요?\"",
35
+ "actual_output": "A: TAB S10 제품의 도장 공정 수율은 평균 98%로 매우 양호합니다. 세부적으로 TAB S10 REAR BODY 도장은 98%, TAB S10 KNOB 도장은 99%의 수율을 보이고 있습니다.",
36
+ "expected_output": null,
37
+ "metrics": {}
38
+ },
39
+ {
40
+ "case_index": 4,
41
+ "input": "\"최근 수율이 낮은 공정이 있나요?\"",
42
+ "actual_output": "A: 네, 몇 가지 주의가 필요한 공정이 있습니다:\n\nR47 ENCLOSURE, LOWER, BATTERY, LARGE 사출: 59%\nR47 ARM, FRONT RIGHT, UPPER 사출: 80%\nTab S10 FE FRONT BODY 사출: 87% 이 공정들은 90% 미만의 수율로 개선이 필요합니다.",
43
+ "expected_output": null,
44
+ "metrics": {}
45
+ },
46
+ {
47
+ "case_index": 5,
48
+ "input": "\"삼성 폴더블폰 부품(SM-F 시리즈) 생산 현황은?\"",
49
+ "actual_output": "A: 삼성 폴더블폰 부품 생산이 활발합니다:\n\nSM-F721U: FRONT DECO MAIN/SUB NC 공정 수율 96-97%\nSM-F731U: NC 공정 수율 97%, 조립 수율 100%\nSM-F741U: NC 공정 수율 95%, 레이저 공정 수율 99%\nSM-F936U: NC 및 조립 공정 모두 100% 수율 달성\n 재고 관련 질문",
50
+ "expected_output": null,
51
+ "metrics": {}
52
+ },
53
+ {
54
+ "case_index": 6,
55
+ "input": "\"R47 시리즈 재고 현황이 어떻게 되나요?\"",
56
+ "actual_output": "A: R47 시리즈 주요 품목 재고 현황:\n\nR47 ENCLOSURE, LOWER, BATTERY, LARGE 가공품: 568 EA (양품)\n기타 R47 부품들은 현재 재고가 없는 상태입니다.\n대부분 게이트 커팅 가공이나 사출 공정을 거치는 부품들입니다.",
57
+ "expected_output": null,
58
+ "metrics": {}
59
+ },
60
+ {
61
+ "case_index": 7,
62
+ "input": "\"C18 제품군 재고가 있나요?\"",
63
+ "actual_output": "A: C18 제품군은 모두 재고가 0인 상태입니다. CLAMSHELL COVER, ENCLOSURE 등 주요 부품들이 재고 소진 상태이므로 생산 계획 수립이 필요합니다.\n\n 원가 관련 질문",
64
+ "expected_output": null,
65
+ "metrics": {}
66
+ },
67
+ {
68
+ "case_index": 8,
69
+ "input": "\"우리 회사 매출원가율이 높은 이유가 뭐야?\"",
70
+ "actual_output": "A: 2025년 1월 전사 매출원가율은 92%로 매우 높습니다. 주요 원인은:\n\n매입비(원부자재+외주가공비): 67% - 가장 큰 비중\n노무비: 12%\n제조경비: 11% 특히 베트남 사업부(94%)와 인도 사업부(92%)의 매출원가율이 높아 수익성 개선이 시급합니다.",
71
+ "expected_output": null,
72
+ "metrics": {}
73
+ },
74
+ {
75
+ "case_index": 9,
76
+ "input": "\"실패비용이 얼마나 발생했나요?\"",
77
+ "actual_output": "A: 2025년 1월 전사 실패비용은 5.16억원(매출 대비 2%)입니다. 사업부별로는:\n\n한국: 0.23억원 (1%)\n베트남: 3.95억원 (2%) - 가장 높음\n인도: 0.48억원 (1%)\n윈테크: 0.50억원 (1%) 베트남 사업부의 실패비용 절감이 필요합니다.\n 작업 관련 질문",
78
+ "expected_output": null,
79
+ "metrics": {}
80
+ },
81
+ {
82
+ "case_index": 10,
83
+ "input": "\"SMF741UB6 조립 작업 시 주의사항이 뭐야?\"",
84
+ "actual_output": "A: SMF741UB6 FRONT DECO SUB 조립 작업표준서에 따른 주요 주의사항을 확인해야 합니다. 2024년 7월 8일에 조립 부분이 수정된 최신 버전을 참고하시기 바랍니다.",
85
+ "expected_output": null,
86
+ "metrics": {}
87
+ },
88
+ {
89
+ "case_index": 11,
90
+ "input": "\"이번 달 생산성이 가장 좋은 공정은?\"",
91
+ "actual_output": "A: 다음 공정들이 100% 수율을 달성했습니다:\n\nSM-F936U NC 및 조립 공정\nC18 SHIM 가공 및 사출\nPA3 DECO 아노다이징, 샌딩, 버핑\n대부분의 조립(ASS'Y) 공정 이들 공정은 벤치마킹 대상으로 삼을 수 있습니다.",
92
+ "expected_output": null,
93
+ "metrics": {}
94
+ }
95
+ ]
96
+ }
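A minimal sketch for inspecting a saved results file such as the one above; the field names are taken directly from the JSON. Note that this particular run recorded 0 passed/failed cases and empty per-case metrics, so only the raw model outputs are available:

    import json

    with open("results/evaluation_results_20250912_115542.json") as f:
        results = json.load(f)

    print(f"{results['passed_cases']}/{results['total_cases']} passed, "
          f"threshold={results['threshold']}, model={results['model_name']}")
    for case in results["detailed_results"]:
        print(case["case_index"], case["input"][:40], "->", case["actual_output"][:60])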
setup_local.py ADDED
@@ -0,0 +1,590 @@
1
+ # #!/usr/bin/env python3
2
+ # """
3
+ # Local setup script for Manufacturing RAG Agent without Docker.
4
+
5
+ # This script helps set up the RAG system using Qdrant Cloud or local alternatives.
6
+ # """
7
+
8
+ # import os
9
+ # import sys
10
+ # import subprocess
11
+ # from pathlib import Path
12
+ # import requests
13
+ # import time
14
+
15
+ # def check_python_version():
16
+ # """Check if Python version is compatible."""
17
+ # if sys.version_info < (3, 8):
18
+ # print("❌ Python 3.8+ is required")
19
+ # return False
20
+ # print(f"✅ Python {sys.version_info.major}.{sys.version_info.minor} detected")
21
+ # return True
22
+
23
+ # def check_dependencies():
24
+ # """Check if required dependencies are installed."""
25
+ # print("\n🔍 Checking dependencies...")
26
+
27
+ # # Check pip packages
28
+ # required_packages = [
29
+ # 'streamlit', 'qdrant-client', 'groq', 'requests',
30
+ # 'pandas', 'plotly', 'PyMuPDF', 'openpyxl', 'pytesseract', 'Pillow'
31
+ # ]
32
+
33
+ # missing_packages = []
34
+ # for package in required_packages:
35
+ # try:
36
+ # __import__(package.replace('-', '_').lower())
37
+ # print(f"✅ {package}")
38
+ # except ImportError:
39
+ # print(f"❌ {package}")
40
+ # missing_packages.append(package)
41
+
42
+ # if missing_packages:
43
+ # print(f"\n📦 Installing missing packages: {', '.join(missing_packages)}")
44
+ # try:
45
+ # subprocess.check_call([
46
+ # sys.executable, '-m', 'pip', 'install'
47
+ # ] + missing_packages)
48
+ # print("✅ All packages installed successfully")
49
+ # except subprocess.CalledProcessError:
50
+ # print("❌ Failed to install packages")
51
+ # return False
52
+
53
+ # return True
54
+
55
+ # def check_tesseract():
56
+ # """Check if Tesseract OCR is installed."""
57
+ # print("\n🔍 Checking Tesseract OCR...")
58
+ # try:
59
+ # result = subprocess.run(['tesseract', '--version'],
60
+ # capture_output=True, text=True)
61
+ # if result.returncode == 0:
62
+ # version = result.stdout.split('\n')[0]
63
+ # print(f"✅ {version}")
64
+ # return True
65
+ # except FileNotFoundError:
66
+ # pass
67
+
68
+ # print("❌ Tesseract OCR not found")
69
+ # print("📥 Install with: brew install tesseract")
70
+ # return False
71
+
72
+ # def setup_environment():
73
+ # """Set up environment variables."""
74
+ # print("\n⚙️ Setting up environment...")
75
+
76
+ # env_file = Path('.env')
77
+ # env_example = Path('.env.example')
78
+
79
+ # if not env_file.exists() and env_example.exists():
80
+ # # Copy example file
81
+ # with open(env_example, 'r') as f:
82
+ # content = f.read()
83
+
84
+ # with open(env_file, 'w') as f:
85
+ # f.write(content)
86
+
87
+ # print("✅ Created .env file from template")
88
+ # print("📝 Please edit .env file with your API keys:")
89
+ # print(" - GROQ_API_KEY (get from https://console.groq.com/)")
90
+ # print(" - SILICONFLOW_API_KEY (get from https://siliconflow.cn/)")
91
+ # print(" - QDRANT_URL and QDRANT_API_KEY (if using Qdrant Cloud)")
92
+ # return False
93
+ # elif env_file.exists():
94
+ # print("✅ .env file exists")
95
+ # return True
96
+ # else:
97
+ # print("❌ No .env.example file found")
98
+ # return False
99
+
100
+ # def test_qdrant_connection():
101
+ # """Test Qdrant connection."""
102
+ # print("\n🔍 Testing Qdrant connection...")
103
+
104
+ # # Try local Qdrant first
105
+ # try:
106
+ # response = requests.get('http://localhost:6333/health', timeout=5)
107
+ # if response.status_code == 200:
108
+ # print("✅ Local Qdrant is running")
109
+ # return True
110
+ # except requests.exceptions.RequestException:
111
+ # pass
112
+
113
+ # # Try Qdrant Cloud if configured
114
+ # qdrant_url = os.getenv('QDRANT_URL')
115
+ # qdrant_key = os.getenv('QDRANT_API_KEY')
116
+
117
+ # if qdrant_url and qdrant_key:
118
+ # try:
119
+ # headers = {'api-key': qdrant_key} if qdrant_key else {}
120
+ # response = requests.get(f"{qdrant_url}/health", headers=headers, timeout=10)
121
+ # if response.status_code == 200:
122
+ # print("✅ Qdrant Cloud connection successful")
123
+ # return True
124
+ # except requests.exceptions.RequestException as e:
125
+ # print(f"❌ Qdrant Cloud connection failed: {e}")
126
+
127
+ # print("❌ No Qdrant connection available")
128
+ # print("💡 Options:")
129
+ # print(" 1. Start local Qdrant: docker run -p 6333:6333 qdrant/qdrant")
130
+ # print(" 2. Use Qdrant Cloud: https://cloud.qdrant.io/")
131
+ # print(" 3. Download Qdrant binary: https://github.com/qdrant/qdrant/releases")
132
+ # return False
133
+
134
+ # def create_data_directories():
135
+ # """Create necessary data directories."""
136
+ # print("\n📁 Creating data directories...")
137
+
138
+ # directories = [
139
+ # 'data',
140
+ # 'data/documents',
141
+ # 'logs',
142
+ # 'results'
143
+ # ]
144
+
145
+ # for directory in directories:
146
+ # Path(directory).mkdir(parents=True, exist_ok=True)
147
+ # print(f"✅ {directory}/")
148
+
149
+ # return True
150
+
151
+ # def test_api_keys():
152
+ # """Test API key configuration."""
153
+ # print("\n🔑 Testing API keys...")
154
+
155
+ # # Load environment variables
156
+ # from dotenv import load_dotenv
157
+ # load_dotenv()
158
+
159
+ # # Test Groq API
160
+ # groq_key = os.getenv('GROQ_API_KEY')
161
+ # if groq_key:
162
+ # try:
163
+ # headers = {'Authorization': f'Bearer {groq_key}'}
164
+ # response = requests.get('https://api.groq.com/openai/v1/models',
165
+ # headers=headers, timeout=10)
166
+ # if response.status_code == 200:
167
+ # print("✅ Groq API key is valid")
168
+ # else:
169
+ # print(f"❌ Groq API key test failed: {response.status_code}")
170
+ # except requests.exceptions.RequestException as e:
171
+ # print(f"❌ Groq API connection failed: {e}")
172
+ # else:
173
+ # print("❌ GROQ_API_KEY not set")
174
+
175
+ # # Test Silicon Flow API
176
+ # sf_key = os.getenv('SILICONFLOW_API_KEY')
177
+ # if sf_key:
178
+ # print("✅ Silicon Flow API key is set (cannot test without making API call)")
179
+ # else:
180
+ # print("❌ SILICONFLOW_API_KEY not set")
181
+
182
+ # return groq_key and sf_key
183
+
184
+ # def main():
185
+ # """Main setup function."""
186
+ # print("🏭 Manufacturing RAG Agent Setup")
187
+ # print("=" * 40)
188
+
189
+ # # Check Python version
190
+ # if not check_python_version():
191
+ # sys.exit(1)
192
+
193
+ # # Check dependencies
194
+ # if not check_dependencies():
195
+ # print("\n❌ Dependency check failed")
196
+ # sys.exit(1)
197
+
198
+ # # Check Tesseract
199
+ # tesseract_ok = check_tesseract()
200
+
201
+ # # Setup environment
202
+ # env_setup = setup_environment()
203
+
204
+ # # Create directories
205
+ # create_data_directories()
206
+
207
+ # if env_setup:
208
+ # # Test API keys
209
+ # api_keys_ok = test_api_keys()
210
+
211
+ # # Test Qdrant
212
+ # qdrant_ok = test_qdrant_connection()
213
+
214
+ # print("\n" + "=" * 40)
215
+ # print("📋 Setup Summary:")
216
+ # print(f"✅ Python: OK")
217
+ # print(f"✅ Dependencies: OK")
218
+ # print(f"{'✅' if tesseract_ok else '❌'} Tesseract OCR: {'OK' if tesseract_ok else 'Missing'}")
219
+ # print(f"{'✅' if api_keys_ok else '❌'} API Keys: {'OK' if api_keys_ok else 'Missing'}")
220
+ # print(f"{'✅' if qdrant_ok else '❌'} Qdrant: {'OK' if qdrant_ok else 'Not available'}")
221
+
222
+ # if tesseract_ok and api_keys_ok and qdrant_ok:
223
+ # print("\n🎉 Setup complete! You can now run:")
224
+ # print(" python launch_rag_demo.py")
225
+ # else:
226
+ # print("\n⚠️ Setup incomplete. Please address the issues above.")
227
+ # if not tesseract_ok:
228
+ # print(" Install Tesseract: brew install tesseract")
229
+ # if not api_keys_ok:
230
+ # print(" Configure API keys in .env file")
231
+ # if not qdrant_ok:
232
+ # print(" Set up Qdrant (see options above)")
233
+ # else:
234
+ # print("\n📝 Please configure your .env file with API keys, then run this script again.")
235
+
236
+ # if __name__ == "__main__":
237
+ # main()
238
+
239
+
240
+
241
+
242
+
243
+ #!/usr/bin/env python3
244
+ """
245
+ Quick fix script for Manufacturing RAG Agent issues
246
+ """
247
+
248
+ import os
249
+ import sys
250
+ from dotenv import load_dotenv
251
+ from qdrant_client import QdrantClient
252
+ from qdrant_client.http import models
253
+ import requests
254
+
255
+ load_dotenv()
256
+
257
+ def check_environment():
258
+ """Check all environment variables."""
259
+ print("🔍 Checking Environment Variables")
260
+ print("=" * 40)
261
+
262
+ required_vars = {
263
+ 'GROQ_API_KEY': 'Groq LLM API',
264
+ 'SILICONFLOW_API_KEY': 'SiliconFlow Embedding API',
265
+ 'QDRANT_URL': 'Qdrant Vector Database URL',
266
+ 'QDRANT_API_KEY': 'Qdrant API Key'
267
+ }
268
+
269
+ issues = []
270
+
271
+ for var, description in required_vars.items():
272
+ value = os.getenv(var)
273
+ if value:
274
+ # Mask sensitive info
275
+ if 'KEY' in var:
276
+ display_value = f"{value[:8]}...{value[-4:]}" if len(value) > 12 else "***"
277
+ else:
278
+ display_value = value
279
+ print(f"✅ {var}: {display_value}")
280
+ else:
281
+ print(f"❌ {var}: Not set")
282
+ issues.append(f"{var} ({description})")
283
+
284
+ if issues:
285
+ print(f"\n❌ Missing environment variables:")
286
+ for issue in issues:
287
+ print(f" - {issue}")
288
+ return False
289
+
290
+ return True
291
+
292
+ def test_siliconflow_api():
293
+ """Test SiliconFlow API connection and get actual embedding dimensions."""
294
+ print("\n🧪 Testing SiliconFlow API")
295
+ print("=" * 30)
296
+
297
+ api_key = os.getenv('SILICONFLOW_API_KEY')
298
+ if not api_key:
299
+ print("❌ No API key found")
300
+ return None
301
+
302
+ try:
303
+ payload = {
304
+ "model": "Qwen/Qwen3-Embedding-8B",
305
+ "input": ["test embedding to check dimensions"],
306
+ "encoding_format": "float"
307
+ }
308
+
309
+ headers = {
310
+ 'Authorization': f'Bearer {api_key}',
311
+ 'Content-Type': 'application/json'
312
+ }
313
+
314
+ print("📡 Testing API connection...")
315
+ response = requests.post(
316
+ "https://api.siliconflow.com/v1/embeddings",
317
+ json=payload,
318
+ headers=headers,
319
+ timeout=15
320
+ )
321
+
322
+ if response.status_code == 200:
323
+ data = response.json()
324
+ if 'data' in data and len(data['data']) > 0:
325
+ embedding = data['data'][0]['embedding']
326
+ dimensions = len(embedding)
327
+ print(f"✅ API working! Embedding dimensions: {dimensions}")
328
+ return dimensions
329
+ else:
330
+ print("❌ No embedding data returned")
331
+ elif response.status_code == 401:
332
+ print("❌ API Key Invalid - Please check your SILICONFLOW_API_KEY")
333
+ print(" Get a new key from: https://siliconflow.com/")
334
+ else:
335
+ print(f"❌ API Error: {response.status_code}")
336
+ print(f"Response: {response.text}")
337
+
338
+ except Exception as e:
339
+ print(f"❌ Connection error: {e}")
340
+
341
+ return None
342
+
343
+ def test_qdrant_connection():
344
+ """Test Qdrant connection."""
345
+ print("\n🗄️ Testing Qdrant Connection")
346
+ print("=" * 35)
347
+
348
+
349
+ qdrant_api_key = os.getenv('QDRANT_API_KEY')  # never hard-code credentials; read the key from the environment only
350
+ # QDRANT_URL=os.getenv('QDRANT_URL', 'https://50f53cc8-bbb0-4939-8254-8f025a577222.us-west-2-0.aws.cloud.qdrant.io:6333')
351
+
352
+ # qdrant_api_key = ""
353
+ qdrant_url = os.getenv('QDRANT_URL', 'http://localhost:6333')
354
+
355
+
356
+ if not qdrant_url:
357
+ print("❌ No Qdrant URL found")
358
+ return False
359
+
360
+ try:
361
+ print(f"🔗 Connecting to: {qdrant_url}")
362
+ client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
363
+
364
+ # Test connection
365
+ collections = client.get_collections()
366
+ print(f"✅ Connected! Found {len(collections.collections)} collections")
367
+
368
+ # Check manufacturing_docs collection
369
+ collection_names = [col.name for col in collections.collections]
370
+ if 'manufacturing_docs' in collection_names:
371
+ collection_info = client.get_collection('manufacturing_docs')
372
+ current_dim = collection_info.config.params.vectors.size
373
+ points_count = collection_info.points_count
374
+ print(f"📋 Collection 'manufacturing_docs' exists:")
375
+ print(f" - Vector dimensions: {current_dim}")
376
+ print(f" - Points count: {points_count}")
377
+ return current_dim
378
+ else:
379
+ print("ℹ️ Collection 'manufacturing_docs' doesn't exist yet")
380
+ return 0
381
+
382
+ except Exception as e:
383
+ print(f"❌ Connection failed: {e}")
384
+ return False
385
+
386
+ def fix_qdrant_collection(correct_dimensions):
387
+ """Fix the Qdrant collection with correct dimensions."""
388
+ print(f"\n🔧 Fixing Qdrant Collection (Dimensions: {correct_dimensions})")
389
+ print("=" * 60)
390
+
391
+ qdrant_url = os.getenv('QDRANT_URL')
392
+ qdrant_api_key = os.getenv('QDRANT_API_KEY')
393
+ collection_name = 'manufacturing_docs'
394
+
395
+ try:
396
+ client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
397
+
398
+ # Check current collection
399
+ collections = client.get_collections()
400
+ collection_names = [col.name for col in collections.collections]
401
+
402
+ if collection_name in collection_names:
403
+ collection_info = client.get_collection(collection_name)
404
+ current_dim = collection_info.config.params.vectors.size
405
+
406
+ if current_dim == correct_dimensions:
407
+ print(f"✅ Collection already has correct dimensions ({correct_dimensions})")
408
+ return True
409
+
410
+ print(f"🗑️ Deleting existing collection (wrong dimensions: {current_dim})...")
411
+ client.delete_collection(collection_name)
412
+
413
+ # Create new collection
414
+ print(f"🆕 Creating collection with {correct_dimensions} dimensions...")
415
+ client.create_collection(
416
+ collection_name=collection_name,
417
+ vectors_config=models.VectorParams(
418
+ size=correct_dimensions,
419
+ distance=models.Distance.COSINE
420
+ )
421
+ )
422
+
423
+ # Create indexes
424
+ print("🔍 Creating payload indexes...")
425
+ indexes = [
426
+ ("document_id", models.KeywordIndexParams()),
427
+ ("chunk_id", models.KeywordIndexParams()),
428
+ ("page_number", models.IntegerIndexParams()),
429
+ ("worksheet_name", models.KeywordIndexParams()),
430
+ ]
431
+
432
+ for field_name, field_schema in indexes:
433
+ try:
434
+ client.create_payload_index(
435
+ collection_name=collection_name,
436
+ field_name=field_name,
437
+ field_schema=field_schema
438
+ )
439
+ except Exception as e:
440
+ print(f"⚠️ Index creation warning for {field_name}: {e}")
441
+
442
+ print("✅ Collection fixed successfully!")
443
+ return True
444
+
445
+ except Exception as e:
446
+ print(f"❌ Failed to fix collection: {e}")
447
+ return False
448
+
449
+ def update_gradio_demo():
450
+ """Update the Gradio demo with correct vector dimensions."""
451
+ print("\n📝 Updating Gradio Demo")
452
+ print("=" * 25)
453
+
454
+ # Check if the demo file exists
455
+ demo_files = [
456
+ 'fixed_gradio_demo.py',
457
+ 'fixed_gradio_file_handling.py',
458
+ 'gradio_demo.py'
459
+ ]
460
+
461
+ demo_file = None
462
+ for file in demo_files:
463
+ if os.path.exists(file):
464
+ demo_file = file
465
+ break
466
+
467
+ if not demo_file:
468
+ print("❌ No Gradio demo file found")
469
+ print("Please create fixed_gradio_demo.py with the corrected code")
470
+ return False
471
+
472
+ try:
473
+ # Read the file
474
+ with open(demo_file, 'r') as f:
475
+ content = f.read()
476
+
477
+ # Update vector_size
478
+ if "'vector_size': 1024," in content:
479
+ content = content.replace("'vector_size': 1024,", "'vector_size': 4096,")
480
+ print("✅ Updated vector_size from 1024 to 4096")
481
+ elif "'vector_size': 4096," in content:
482
+ print("✅ Vector size already correct (4096)")
483
+ else:
484
+ print("⚠️ Could not find vector_size in demo file")
485
+
486
+ # Write back
487
+ with open(demo_file, 'w') as f:
488
+ f.write(content)
489
+
490
+ print(f"✅ Updated {demo_file}")
491
+ return True
492
+
493
+ except Exception as e:
494
+ print(f"❌ Failed to update demo: {e}")
495
+ return False
496
+
497
+ def create_quick_demo():
498
+ """Create a quick working demo file."""
499
+ print("\n🚀 Creating Quick Demo")
500
+ print("=" * 22)
501
+
502
+ demo_content = '''import gradio as gr
503
+ import os
504
+ from dotenv import load_dotenv
505
+ load_dotenv()
506
+
507
+ # Quick test demo
508
+ def test_apis():
509
+ """Test API connections."""
510
+ results = []
511
+
512
+ # Test Groq
513
+ groq_key = os.getenv('GROQ_API_KEY')
514
+ results.append(f"Groq API Key: {'✅ Set' if groq_key else '❌ Missing'}")
515
+
516
+ # Test SiliconFlow
517
+ sf_key = os.getenv('SILICONFLOW_API_KEY')
518
+ results.append(f"SiliconFlow API Key: {'✅ Set' if sf_key else '❌ Missing'}")
519
+
520
+ # Test Qdrant
521
+ qdrant_url = os.getenv('QDRANT_URL')
522
+ results.append(f"Qdrant URL: {'✅ Set' if qdrant_url else '❌ Missing'}")
523
+
524
+ return "\\n".join(results)
525
+
526
+ # Create simple interface
527
+ with gr.Blocks(title="RAG System Test") as demo:
528
+ gr.Markdown("# 🧪 RAG System API Test")
529
+
530
+ test_btn = gr.Button("Test APIs")
531
+ output = gr.Textbox(label="Results", lines=10)
532
+
533
+ test_btn.click(test_apis, outputs=output)
534
+
535
+ if __name__ == "__main__":
536
+ demo.launch(server_name="0.0.0.0", server_port=7860)
537
+ '''
538
+
539
+ with open('quick_test_demo.py', 'w') as f:
540
+ f.write(demo_content)
541
+
542
+ print("✅ Created quick_test_demo.py")
543
+ return True
544
+
545
+ def main():
546
+ """Main fix function."""
547
+ print("🏭 Manufacturing RAG Agent - Quick Fix")
548
+ print("=" * 50)
549
+
550
+ # Step 1: Check environment
551
+ if not check_environment():
552
+ print("\n❌ Environment issues found. Please fix your .env file first.")
553
+ return
554
+
555
+ # Step 2: Test SiliconFlow API and get dimensions
556
+ dimensions = test_siliconflow_api()
557
+ if not dimensions:
558
+ print("\n❌ SiliconFlow API test failed. Please check your API key.")
559
+ return
560
+
561
+ # Step 3: Test Qdrant
562
+ current_dim = test_qdrant_connection()
563
+ if current_dim is False:
564
+ print("\n❌ Qdrant connection failed. Please check your Qdrant configuration.")
565
+ return
566
+
567
+ # Step 4: Fix Qdrant collection if needed
568
+ if current_dim != dimensions:
569
+ print(f"\n⚠️ Collection needs fixing: {current_dim} → {dimensions}")
570
+ if fix_qdrant_collection(dimensions):
571
+ print("✅ Qdrant collection fixed!")
572
+ else:
573
+ print("❌ Failed to fix Qdrant collection")
574
+ return
575
+
576
+ # Step 5: Update demo file
577
+ if update_gradio_demo():
578
+ print("✅ Demo file updated!")
579
+
580
+ # Step 6: Create quick test demo
581
+ create_quick_demo()
582
+
583
+ print("\n🎉 All fixes applied!")
584
+ print("\n📋 Next Steps:")
585
+ print("1. Test APIs: python quick_test_demo.py")
586
+ print("2. Run full demo: python fixed_gradio_demo.py")
587
+ print("3. Upload documents and test questions")
588
+
589
+ if __name__ == "__main__":
590
+ main()
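After the fixes run, the rebuilt collection can be sanity-checked independently. A minimal verification sketch, assuming the `qdrant-client` package and the `manufacturing_docs` collection name used in `src/config.yaml`:

```python
# Verification sketch (assumptions: qdrant-client is installed and the
# collection is named "manufacturing_docs" as in src/config.yaml).
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

load_dotenv()
client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
info = client.get_collection("manufacturing_docs")
# For a single unnamed vector this is a VectorParams object; expect size 4096
# after the fix above.
print("Vector size:", info.config.params.vectors.size)
```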
src/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ Manufacturing RAG Agent Package
3
+
4
+ This package contains the core components for the Manufacturing RAG (Retrieval-Augmented Generation) Agent,
5
+ including document processing, vector storage, embedding generation, and question answering capabilities.
6
+ """
7
+
8
+ from src.document_processor import DocumentProcessor, ProcessedDocument, DocumentChunk
9
+ from src.embedding_system import EmbeddingSystem
10
+ from src.vector_store import QdrantVectorStore
11
+ from src.rag_engine import RAGEngine, RAGResponse
12
+ from src.metadata_manager import MetadataManager
13
+
14
+ __all__ = [
15
+ 'DocumentProcessor',
16
+ 'ProcessedDocument',
17
+ 'DocumentChunk',
18
+ 'EmbeddingSystem',
19
+ 'QdrantVectorStore',
20
+ 'RAGEngine',
21
+ 'RAGResponse',
22
+ 'MetadataManager'
23
+ ]
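A short usage sketch of the import surface this `__init__.py` exposes (run from the repo root; the config dict is illustrative and `SILICONFLOW_API_KEY` is assumed to be set for `EmbeddingSystem`):

```python
# Illustrative sketch of the package-level imports; the config keys shown
# are placeholders, not a complete configuration.
from src import EmbeddingSystem

embedder = EmbeddingSystem({"embedding_model": "Qwen/Qwen3-Embedding-8B"})
print(type(embedder).__name__)  # -> EmbeddingSystem
```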
src/clients.py ADDED
@@ -0,0 +1,283 @@
1
+ from dotenv import load_dotenv
2
+ import google.genai as genai
3
+ from utilites import load_yaml_config
4
+ from groq import Groq
5
+ from openai import OpenAI
6
+ from datetime import datetime
7
+ import requests
8
+
9
+ import os
10
+ import sys
11
+
12
+ import logging
13
+
14
+ # Import logger here to avoid circular imports
15
+ try:
16
+ from logger.custom_logger import CustomLoggerTracker
17
+ custom_log = CustomLoggerTracker()
18
+ logger = custom_log.get_logger("clients")
19
+ except ImportError:
20
+ # Fallback to standard logging if custom logger not available
21
+ logger = logging.getLogger("clients")
22
+
23
+ # Load environment variables
24
+ load_dotenv()
25
+
26
+ config = load_yaml_config("rag_config.yaml")
27
+
28
+
29
+
30
+ ## Groq
31
+ GROQ_URL = os.environ["GROQ_URL"]
32
+ GROQ_API_TOKEN= os.environ["GROQ_API_TOKEN"]
33
+
34
+
35
+ ## Deepinfra
36
+ DEEPINFRA_API_KEY = os.environ["DEEPINFRA_API_KEY"]
37
+ DEEPINFRA_URL = os.environ["DEEPINFRA_URL"]
38
+ DEEPINFRA_EMBEDDING_URL = os.environ["DEEPINFRA_EMBEDDING_URL"]
39
+ DEEPINFRA_RERANK_URL = os.environ["DEEPINFRA_RERANK_URL"]
40
+
41
+
42
+
43
+ # def qwen_generate_content(prompt: str) -> str:
44
+ # """Streaming chat completion with Qwen on SiliconFlow via OpenAI client."""
45
+ # if not (os.environ['SILICONFLOW_URL'] and os.environ['SILICONFLOW_API_KEY']):
46
+ # logger.error("SILICONFLOW_URL or SILICONFLOW_API_KEY not configured.")
47
+ # return ""
48
+
49
+ # client = OpenAI(base_url=os.environ['SILICONFLOW_URL'], api_key=os.environ['SILICONFLOW_API_KEY'])
50
+ # logger.info("Calling Qwen/Qwen3-30B-Instruct for generation...")
51
+ # output = ""
52
+ # logger.info(f"{config['apis_models']['silicon_flow']['qwen']['chat3_30b']}")
53
+ # response = client.chat.completions.create(
54
+ # model=config["apis_models"]["silicon_flow"]["qwen"]["chat3_30b"],
55
+ # messages=[{"role": "user", "content": prompt}],
56
+ # stream=True)
57
+
58
+ # for chunk in response:
59
+ # if not getattr(chunk, "choices", None):
60
+ # continue
61
+ # delta = chunk.choices[0].delta
62
+ # if getattr(delta, "content", None):
63
+ # output += delta.content
64
+
65
+ # # if hasattr(delta, "reasoning_content") and delta.reasoning_content:
66
+ # # output += delta.reasoning_content
67
+
68
+ # logger.info("Successfully generated content with Qwen")
69
+ # return output.strip()
70
+
71
+
72
+
73
+ def groq_qwen_generate_content(prompt: str) -> str:
74
+ """Streaming chat completion with Qwen on SiliconFlow via OpenAI client."""
75
+ if not (GROQ_URL and GROQ_API_TOKEN):
76
+ logger.error("GROQ_URL or GROQ_API_TOKEN not configured.")
77
+ return ""
78
+
79
+ client = OpenAI(base_url=GROQ_URL, api_key=GROQ_API_TOKEN)
80
+ if client is None:
81
+ logger.error("Failed to initialize Groq client.")
82
+ return ""
83
+ else:
84
+ logger.info("Successfully initialized Groq client.")
85
+ # logger.info("Calling Qwen/Qwen3-32B for generation from Groq...")
86
+ logger.info("Calling openai/gpt-oss-120b for generation from Groq")
87
+
88
+ output = ""
89
+ response = client.chat.completions.create(
90
+ # model=config["apis_models"]["groq"]["qwen"]["chat3_32b"],
91
+ model = config["apis_models"]["groq"]["openai"]["gpt_oss"],
92
+ messages=[{"role": "user", "content": prompt}],
93
+ stream=True,)
94
+ # reasoning_effort="none")
95
+ for chunk in response:
96
+ if not getattr(chunk, "choices", None):
97
+ continue
98
+ delta = chunk.choices[0].delta
99
+ if getattr(delta, "content", None):
100
+ output += delta.content
101
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
102
+ output += delta.reasoning_content
103
+ logger.info("Successfully generated content with Qwen")
104
+ return output.strip()
105
+
106
+
107
+
108
+
109
+ def siliconflow_qwen_generate_content(prompt: str) -> str:
110
+ """Streaming chat completion with Qwen on SiliconFlow via OpenAI client."""
111
+ if not (os.environ.get('SILICONFLOW_URL') and os.environ.get('SILICONFLOW_API_KEY')):  # .get() avoids a KeyError before the guard can log
112
+ logger.error("SILICONFLOW_URL or SILICONFLOW_API_KEY not configured.")
113
+ return ""
114
+ client = OpenAI(base_url=os.environ['SILICONFLOW_URL'], api_key=os.environ['SILICONFLOW_API_KEY'])
115
+ if client is None:
116
+ logger.error("Failed to initialize SiliconFlow client.")
117
+ return ""
118
+ else:
119
+ logger.info("Successfully initialized SiliconFlow client.")
120
+ logger.info("Calling Qwen/Qwen3-30B-Instruct for generation...")
121
+ output = ""
122
+ logger.info(f"{config['apis_models']['silicon_flow']['qwen']['chat3_30b']}")
123
+ response = client.chat.completions.create(
124
+ model=config["apis_models"]["silicon_flow"]["qwen"]["chat3_30b"],
125
+ messages=[{"role": "user", "content": prompt}],
126
+ stream=True)
127
+ for chunk in response:
128
+ if not getattr(chunk, "choices", None):
129
+ continue
130
+ delta = chunk.choices[0].delta
131
+ if getattr(delta, "content", None):
132
+ output += delta.content
133
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
134
+ output += delta.reasoning_content
135
+ logger.info("Successfully generated content with Qwen")
136
+ return output.strip()
137
+
138
+
139
+
140
+ def deepinfra_qwen_generate_content(prompt: str) -> str:
141
+ """Streaming chat completion with Qwen on SiliconFlow via OpenAI client."""
142
+ if not (DEEPINFRA_URL and DEEPINFRA_API_KEY):
143
+ logger.error("GROQ_URL or GROQ_API_TOKEN not configured.")
144
+ return ""
145
+
146
+ client = OpenAI(base_url=DEEPINFRA_URL, api_key=DEEPINFRA_API_KEY)
147
+ if client is None:
148
+ logger.error("Failed to initialize Groq client.")
149
+ return ""
150
+ else:
151
+ logger.info("Successfully initialized Groq client.")
152
+ # logger.info("Calling Qwen/Qwen3-32B for generation from DeepInfra...")
153
+ logger.info("Calling openai gpt-oss-120b for generation from DeepInfra...")
154
+ output = ""
155
+ response = client.chat.completions.create(
156
+ # model=config["apis_models"]["groq"]["qwen"]["chat3_32b"],
157
+ model = config["apis_models"]["groq"]["openai"]["gpt_oss"],
158
+ messages=[{"role": "user", "content": prompt}],
159
+ temperature=1,
160
+ max_completion_tokens=8192,
161
+ top_p=1,
162
+ reasoning_effort="low",
163
+ stream=True,
164
+ tools=[{"type":"browser_search"}])
165
+
166
+ # reasoning_effort="none")
167
+ for chunk in response:
168
+ if not getattr(chunk, "choices", None):
169
+ continue
170
+ delta = chunk.choices[0].delta
171
+ if getattr(delta, "content", None):
172
+ output += delta.content
173
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
174
+ output += delta.reasoning_content
175
+ logger.info("Successfully generated content with Qwen")
176
+ return output.strip()
177
+
178
+
179
+
180
+ def deepinfra_embedding(texts: list[str], batch_size: int = 50) -> list[list[float]]:
181
+ all_embeddings = []
182
+ headers = {
183
+ "Authorization": f"Bearer {DEEPINFRA_API_KEY}",
184
+ "Content-Type": "application/json"}
185
+ for i in range(0, len(texts), batch_size):
186
+ batch = texts[i:i + batch_size]
187
+ payload = {
188
+ "model": config["apis_models"]["deepinfra"]["qwen"]["embed"],
189
+ "input": batch}
190
+ try:
191
+ response = requests.post(
192
+ DEEPINFRA_EMBEDDING_URL, json=payload, headers=headers)
193
+ # Check if request was successful
194
+ if response.status_code != 200:
195
+ logger.error(f"DeepInfra API error {response.status_code}: {response.text}")
196
+ # Return empty embeddings for failed batch
197
+ all_embeddings.extend([[] for _ in batch])
198
+ continue
199
+ data = response.json()
200
+ # Check for API error in response
201
+ if "detail" in data and "error" in data["detail"]:
202
+ logger.error(f"DeepInfra API error: {data['detail']['error']}")
203
+ # Return empty embeddings for failed batch
204
+ all_embeddings.extend([[] for _ in batch])
205
+ continue
206
+ if "data" not in data:
207
+ logger.error(f"Invalid response format: {data}")
208
+ # Return empty embeddings for failed batch
209
+ all_embeddings.extend([[] for _ in batch])
210
+ continue
211
+ batch_embs = [item["embedding"] for item in data["data"]]
212
+ all_embeddings.extend(batch_embs)
213
+ except requests.RequestException as e:
214
+ logger.error(f"Request failed: {e}")
215
+ # Return empty embeddings for failed batch
216
+ all_embeddings.extend([[] for _ in batch])
217
+ return all_embeddings
218
+
219
+
220
+
221
+ def deepinfra_rerank(batch: list[str], items_to_rerank: list[str]) -> str | list[str]:  # returns the top-ranked item, or the original batch on failure
222
+ payload = {
223
+ "model": config["apis_models"]["deepinfra"]["qwen"]["rerank"],
224
+ "input": batch}
225
+ headers = {
226
+ "Authorization": f"Bearer {DEEPINFRA_API_KEY}",
227
+ "Content-Type": "application/json"}
228
+ r = requests.post(
229
+ DEEPINFRA_RERANK_URL,
230
+ json=payload,
231
+ headers=headers,
232
+ timeout=60,)
233
+ if r.ok:
234
+ rerank_data = r.json()
235
+ ranked_docs = sorted(
236
+ zip(rerank_data.get("results", []), items_to_rerank),
237
+ key=lambda x: x[0].get("relevance_score", 0),
238
+ reverse=True)
239
+ reranked = ranked_docs[0][1] if ranked_docs else batch
240
+ return reranked
241
+ else:
242
+ return batch
243
+
244
+ def deepinfra_client():
245
+ return OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url=os.environ["DEEPINFRA_URL"],)
246
+
247
+
248
+ def qwen_generate(prompt: str) -> str:
249
+ """Streaming chat completion with Qwen on SiliconFlow and Groq via OpenAI client."""
250
+ if config["apis_models"]["num"] == 1:
251
+ return siliconflow_qwen_generate_content(prompt)
252
+ else:
253
+ return groq_qwen_generate_content(prompt)
254
+
255
+
256
+
257
+ if __name__ == "__main__":
258
+ # client = init_weaviate_client()
259
+ # if client is None:
260
+ # logger.error(f"api of weaviate is not working")
261
+ # client.close()
262
+
263
+ # Prompt_template_LLM_Generation is not imported in this module, so use a plain prompt:
264
+ new_query="what is autism")
265
+ logger.info(f"groq qwen generate.....: {groq_qwen_generate_content(gen_prompt)}")
266
+
267
+ print(f"=" * 50)
268
+ response = siliconflow_qwen_generate_content("what is autism")
269
+ logger.info(f"siliconflow qwen response: {response}")
270
+
271
+ print(f"=" * 50)
272
+
273
+ # Try DeepInfra first, then fallback to Groq
274
+ response = deepinfra_embedding(["what is autism"], 1)
275
+ if response and response[0]: # Check if we got valid embeddings
276
+ logger.info(f"deepinfra embedding response: {response}")
277
+ else:
278
+ raise ValueError("Empty embeddings returned")
279
+
280
+
281
+ print(f"=" * 50)
282
+ response = deepinfra_rerank(["what is autism"], ["what is autism"])
283
+ logger.info(f"deepinfra rerank response: {response}")
src/config.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import yaml
3
+ from pathlib import Path
4
+ from typing import Dict, Any
5
+ import logging
6
+ import sys
7
+
8
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
9
+
10
+ from .utilites import load_environment_variables, validate_api_keys
11
+
12
+
13
+ # Import logger here to avoid circular imports
14
+ try:
15
+ from logger.custom_logger import CustomLoggerTracker
16
+ custom_log = CustomLoggerTracker()
17
+ logger = custom_log.get_logger("config")
18
+ except ImportError:
19
+ # Fallback to standard logging if custom logger not available
20
+ logger = logging.getLogger("config")
21
+
22
+ class Config:
23
+ def __init__(self, config_path: str = "config.yaml"):
24
+ logger.info("Start Loading data from configs")
25
+ load_environment_variables()
26
+ self.config_path = Path(config_path)
27
+ self.config = self._load_config()
28
+ self._validate_config()
29
+
30
+ # Validate API keys
31
+ api_validation = validate_api_keys()
32
+ if not api_validation['valid']:
33
+ logger.warning(f"Some API keys missing: {api_validation['missing_required']}")
34
+ # Don't raise error for missing optional keys, just warn
35
+
36
+
37
+ def _load_config(self) -> Dict[str, Any]:
38
+ try:
39
+ if not self.config_path.exists():
40
+ logger.error(f"Configuration file not found: {self.config_path}")
41
+ raise FileNotFoundError(f"Configuration file not found: {self.config_path}")
42
+ with open(self.config_path, 'r', encoding='utf-8') as f:
43
+ config = yaml.safe_load(f)
44
+ logger.info(f"Configuration loaded successfully from {self.config_path}")
45
+ return config
46
+
47
+ except yaml.YAMLError as e:
48
+ logger.error(f"Error parsing YAML configuration: {e}")
49
+ raise
50
+
51
+ except Exception as e:
52
+ logger.error(f"Unexpected error loading configuration: {e}")
53
+ raise
54
+
55
+ def _validate_config(self) -> None:
56
+ """Validate configuration based on the actual YAML structure."""
57
+
58
+ # Check if we have either the old structure (gemini_model) or new structure (models + rag_system)
59
+ has_gemini = 'gemini_model' in self.config
60
+ has_models_section = 'models' in self.config
61
+ has_rag_section = 'rag_system' in self.config
62
+
63
+ if not has_gemini and not has_models_section:
64
+ logger.error("Missing required configuration: either 'gemini_model' or 'models' section must be configured")
65
+ raise ValueError("Missing required configuration: either 'gemini_model' or 'models' section must be configured")
66
+
67
+ # Validate models section if present
68
+ if has_models_section:
69
+ models_config = self.config['models']
70
+ required_models = ['embedding_model', 'llm_model']
71
+ for key in required_models:
72
+ if key not in models_config:
73
+ logger.error(f"Missing required model configuration: models.{key}")
74
+ raise ValueError(f"Missing required model configuration: models.{key}")
75
+
76
+ # Validate rag_system section if present (optional validation)
77
+ if has_rag_section:
78
+ rag_config = self.config['rag_system']
79
+ # These are optional but log if missing
80
+ optional_rag_keys = ['chunk_size', 'chunk_overlap', 'max_context_chunks']
81
+ for key in optional_rag_keys:
82
+ if key not in rag_config:
83
+ logger.debug(f"Optional RAG configuration key not found: rag_system.{key}")
84
+
85
+ # Validate vector store section if present
86
+ if 'vector_store' in self.config:
87
+ vector_config = self.config['vector_store']
88
+ if 'provider' in vector_config and vector_config['provider'] == 'qdrant':
89
+ # Check for qdrant specific config
90
+ if 'collection_name' not in vector_config:
91
+ logger.warning("Qdrant collection_name not specified, will use default")
92
+
93
+ logger.info("Configuration validation passed")
94
+
95
+ def get(self, key: str, default: Any = None) -> Any:
96
+ """Get configuration value by key, supporting nested keys with dot notation."""
97
+ keys = key.split('.')
98
+ value = self.config
99
+
100
+ try:
101
+ for k in keys:
102
+ value = value[k]
103
+ logger.debug(f"Retrieved config value for '{key}': {value}")
104
+ return value
105
+ except (KeyError, TypeError):
106
+ logger.debug(f"Config key '{key}' not found, returning default: {default}")
107
+ return default
108
+
109
+ def get_env_var(self, key: str, required: bool = True) -> str:
110
+ value = os.getenv(key)
111
+ if required and not value:
112
+ logger.error(f"Required environment variable not found: {key}")
113
+ raise ValueError(f"Required environment variable not found: {key}")
114
+ if value:
115
+ logger.info(f"Environment variable '{key}' loaded successfully")
116
+ else:
117
+ logger.warning(f"Optional environment variable '{key}' not found")
118
+ return value
119
+
120
+ @property
121
+ def gemini_model(self) -> str:
122
+ """Get Gemini model name (optional for RAG system)."""
123
+ return self.get('gemini_model', 'models/gemini-2.5-flash')
124
+
125
+ @property
126
+ def google_api_key(self) -> str:
127
+ """Get Google API key from environment."""
128
+ try:
129
+ return self.get_env_var('GOOGLE_API_KEY')
130
+ except ValueError:
131
+ logger.warning("Google API key not found, this is optional for RAG-only usage")
132
+ return ""
133
+
134
+ # RAG System Properties
135
+ @property
136
+ def rag_config(self) -> Dict[str, Any]:
137
+ """Get RAG system configuration, combining rag_system and models sections."""
138
+ rag_config = self.get('rag_system', {}).copy()
139
+
140
+ # Add models to rag config if they exist
141
+ models_config = self.get('models', {})
142
+ if models_config:
143
+ rag_config.update(models_config)
144
+
145
+ # Add performance settings
146
+ performance_config = self.get('performance', {})
147
+ if performance_config:
148
+ rag_config.update(performance_config)
149
+
150
+ return rag_config
151
+
152
+ @property
153
+ def groq_api_key(self) -> str:
154
+ """Get Groq API key from environment."""
155
+ return self.get_env_var('GROQ_API_KEY', required=False) or ""
156
+
157
+ @property
158
+ def siliconflow_api_key(self) -> str:
159
+ """Get Silicon Flow API key from environment."""
160
+ return self.get_env_var('SILICONFLOW_API_KEY', required=False) or ""
161
+
162
+ @property
163
+ def qdrant_url(self) -> str:
164
+ """Get Qdrant URL from environment or config."""
165
+ env_url = self.get_env_var('QDRANT_URL', required=False)
166
+ if env_url:
167
+ return env_url
168
+ return self.get('vector_store.qdrant_url', 'http://localhost:6333')
169
+
170
+ @property
171
+ def qdrant_api_key(self) -> str:
172
+ """Get Qdrant API key from environment."""
173
+ return self.get_env_var('QDRANT_API_KEY', required=False) or ""
174
+
175
+ @property
176
+ def document_processing_config(self) -> Dict[str, Any]:
177
+ """Get document processing configuration."""
178
+ return self.get('document_processing', {})
179
+
180
+ @property
181
+ def storage_config(self) -> Dict[str, Any]:
182
+ """Get storage configuration."""
183
+ # Combine multiple storage-related sections
184
+ storage_config = {}
185
+
186
+ # Vector store config
187
+ vector_store = self.get('vector_store', {})
188
+ if vector_store:
189
+ storage_config.update(vector_store)
190
+
191
+ # Cache config
192
+ cache_config = self.get('cache', {})
193
+ if cache_config:
194
+ storage_config.update(cache_config)
195
+
196
+ # Add any storage-specific settings
197
+ if 'storage' in self.config:
198
+ storage_config.update(self.config['storage'])
199
+
200
+ return storage_config
201
+
202
+ # Test the configuration loading
203
+ if __name__ == "__main__":
204
+ try:
205
+ config = Config()
206
+ print("✅ Configuration loaded successfully!")
207
+ print(f"RAG Config keys: {list(config.rag_config.keys())}")
208
+ print(f"Has Groq API key: {'Yes' if config.groq_api_key else 'No'}")
209
+ print(f"Has SiliconFlow API key: {'Yes' if config.siliconflow_api_key else 'No'}")
210
+ print(f"Qdrant URL: {config.qdrant_url}")
211
+ except Exception as e:
212
+ print(f"❌ Configuration failed: {e}")
src/config.yaml ADDED
@@ -0,0 +1,129 @@
1
+ # Corrected RAG System Configuration
2
+ # This config works with the corrected code implementation
3
+
4
+ # API Configuration
5
+ apis:
6
+ siliconflow:
7
+ api_key: "${SILICONFLOW_API_KEY}"
8
+ base_url: "https://api.siliconflow.com/v1"
9
+ groq:
10
+ api_key: "${GROQ_API_KEY}"
11
+ base_url: "https://api.groq.com/openai/v1"
12
+
13
+ # Model Configuration
14
+ models:
15
+ # SiliconFlow Models
16
+ embedding_model: "Qwen/Qwen3-Embedding-8B"
17
+ reranker_model: "Qwen/Qwen3-Reranker-8B"
18
+
19
+ # Groq Models (OpenAI compatible)
20
+ llm_model: "openai/gpt-oss-120b" # Primary model
21
+ alternative_models:
22
+ - "openai/gpt-oss-20b" # Faster, less capable
23
+ - "qwen/qwen3-32b" # Good balance
24
+
25
+ # RAG System Configuration
26
+ rag_system:
27
+ # Chunking settings
28
+ chunk_size: 512
29
+ chunk_overlap: 50
30
+ max_context_chunks: 5
31
+
32
+
33
+
34
+ # Retrieval settings
35
+ similarity_threshold: 0.7
36
+ rerank_top_k: 20
37
+ final_top_k: 5
38
+
39
+ # Generation settings
40
+ temperature: 0.1
41
+ max_tokens: 4096
42
+
43
+ # Performance settings
44
+ batch_size: 32
45
+ max_retries: 3
46
+ enable_caching: true
47
+
48
+ # Text Processing Configuration
49
+ text_processing:
50
+ # Text splitting
51
+ separators: ["\n\n", "\n", " ", ""]
52
+
53
+ # Language settings
54
+ language: "korean"
55
+ encoding: "utf-8"
56
+
57
+ # Vector Store Configuration (if using)
58
+ vector_store:
59
+ provider: "qdrant"
60
+ collection_name: "manufacturing_docs"
61
+ vector_size: 1024
62
+ distance_metric: "cosine"
63
+
64
+ # Qdrant specific
65
+ qdrant_url: "${QDRANT_URL}"
66
+ qdrant_api_key: "${QDRANT_API_KEY}"
67
+
68
+ # Document Processing Configuration
69
+ document_processing:
70
+ # Supported formats
71
+ supported_formats: [".pdf", ".xlsx", ".xls", ".txt", ".md", ".png", ".jpg", ".jpeg"]
72
+
73
+ # File size limits
74
+ max_file_size_mb: 100
75
+
76
+ # Processing options
77
+ extract_images: true
78
+ extract_tables: true
79
+ ocr_enabled: true
80
+
81
+ # OCR settings
82
+ ocr_language: "kor+eng"
83
+ ocr_confidence_threshold: 0.6
84
+
85
+ # Logging Configuration
86
+ logging:
87
+ level: "INFO"
88
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
89
+ file_logging: true
90
+ log_file: "logs/rag_system.log"
91
+ max_file_size_mb: 10
92
+ backup_count: 5
93
+
94
+ # Performance Configuration
95
+ performance:
96
+ # Rate limiting
97
+ siliconflow_requests_per_minute: 60
98
+ groq_requests_per_minute: 30
99
+ api_timeout: 60 # Increased timeout for large documents
100
+ max_retries: 3
101
+ retry_delay: 5
102
+ batch_size: 32 # Consider reducing this for large documents
103
+ api_timeout_seconds: 30
104
+ max_workers: 4
105
+
106
+
107
+
108
+ # Cache Configuration
109
+ cache:
110
+ enabled: true
111
+ type: "memory" # Options: memory, redis, file
112
+ ttl_seconds: 3600
113
+ max_size: 1000
114
+
115
+ # Evaluation Configuration
116
+ evaluation:
117
+ metrics: ["accuracy", "relevance", "coherence"]
118
+ threshold: 0.8
119
+ save_results: true
120
+ results_dir: "evaluation_results"
121
+
122
+ # Development/Debug Configuration
123
+ debug:
124
+ verbose_logging: false
125
+ save_intermediate_results: false
126
+ profiling_enabled: false
127
+
128
+
129
+
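Note that `yaml.safe_load` (used by `Config._load_config`) does not expand the `${SILICONFLOW_API_KEY}`-style placeholders above; they arrive as literal strings. A minimal pre-processing sketch:

```python
# Minimal sketch: expand ${VAR} placeholders before parsing, since
# Config._load_config does not do this by itself.
import os
import re
import yaml

def load_config_with_env(path: str) -> dict:
    raw = open(path, encoding="utf-8").read()
    # Replace ${VAR} with the value of VAR; unknown variables become "".
    expanded = re.sub(r"\$\{(\w+)\}", lambda m: os.getenv(m.group(1), ""), raw)
    return yaml.safe_load(expanded)

config = load_config_with_env("src/config.yaml")
print(config["vector_store"]["qdrant_url"])
```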
src/document_processor.py ADDED
@@ -0,0 +1,365 @@
1
+
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Any, Union
8
+ from enum import Enum
9
+ import hashlib
10
+ import sys
11
+ import os
12
+
13
+
14
+
15
+ sys.path.append(os.path.dirname(os.path.abspath(__file__))) # Ensure current directory is in
16
+
17
+ try:
18
+ from logger.custom_logger import CustomLoggerTracker
19
+ custom_log = CustomLoggerTracker()
20
+ logger = custom_log.get_logger("documents_processor")
21
+
22
+ except ImportError:
23
+ # Fallback to standard logging if custom logger not available
24
+ logger = logging.getLogger("documents_processor")
25
+
26
+
27
+
28
+ class DocumentType(Enum):
29
+ """Supported document types."""
30
+ PDF = "pdf"
31
+ EXCEL = "excel"
32
+ IMAGE = "image"
33
+ UNKNOWN = "unknown"
34
+
35
+
36
+ class ProcessingStatus(Enum):
37
+ """Document processing status."""
38
+ PENDING = "pending"
39
+ PROCESSING = "processing"
40
+ COMPLETED = "completed"
41
+ FAILED = "failed"
42
+ PARTIAL = "partial"
43
+
44
+
45
+ @dataclass
46
+ class ExtractedImage:
47
+ """Represents an image extracted from a document."""
48
+ image_id: str
49
+ filename: str
50
+ content: bytes
51
+ format: str # PNG, JPEG, etc.
52
+ width: Optional[int] = None
53
+ height: Optional[int] = None
54
+ ocr_text: Optional[str] = None
55
+ ocr_confidence: Optional[float] = None
56
+ extraction_method: str = "unknown"
57
+ metadata: Dict[str, Any] = field(default_factory=dict)
58
+
59
+
60
+ @dataclass
61
+ class ExtractedTable:
62
+ """Represents a table extracted from a document."""
63
+ table_id: str
64
+ headers: List[str]
65
+ rows: List[List[str]]
66
+ page_number: Optional[int] = None
67
+ worksheet_name: Optional[str] = None
68
+ cell_range: Optional[str] = None
69
+ extraction_confidence: Optional[float] = None
70
+ metadata: Dict[str, Any] = field(default_factory=dict)
71
+
72
+
73
+ @dataclass
74
+ class ChunkMetadata:
75
+ """Metadata for a document chunk."""
76
+ chunk_id: str
77
+ document_id: str
78
+ chunk_index: int
79
+ page_number: Optional[int] = None
80
+ worksheet_name: Optional[str] = None
81
+ cell_range: Optional[str] = None
82
+ section_title: Optional[str] = None
83
+ image_references: List[str] = field(default_factory=list)
84
+ table_references: List[str] = field(default_factory=list)
85
+ extraction_timestamp: datetime = field(default_factory=datetime.now)
86
+ confidence_score: Optional[float] = None
87
+
88
+
89
+ @dataclass
90
+ class DocumentChunk:
91
+ """Represents a chunk of processed document content."""
92
+ content: str
93
+ metadata: ChunkMetadata
94
+ embedding: Optional[List[float]] = None
95
+
96
+ def __post_init__(self):
97
+ """Validate chunk content after initialization."""
98
+ if not self.content.strip():
99
+ logger.warning(f"Empty content in chunk {self.metadata.chunk_id}")
100
+
101
+ if len(self.content) > 10000: # Warn for very large chunks
102
+ logger.warning(f"Large chunk detected ({len(self.content)} chars): {self.metadata.chunk_id}")
103
+
104
+
105
+ @dataclass
106
+ class ProcessedDocument:
107
+ """Represents a fully processed document with all extracted content."""
108
+ document_id: str
109
+ filename: str
110
+ file_path: str
111
+ document_type: DocumentType
112
+ content: str
113
+ metadata: Dict[str, Any]
114
+ images: List[ExtractedImage] = field(default_factory=list)
115
+ tables: List[ExtractedTable] = field(default_factory=list)
116
+ processing_status: ProcessingStatus = ProcessingStatus.PENDING
117
+ processing_timestamp: datetime = field(default_factory=datetime.now)
118
+ file_size: int = 0
119
+ checksum: str = ""
120
+ error_message: Optional[str] = None
121
+
122
+ def __post_init__(self):
123
+ """Generate checksum and validate document after initialization."""
124
+ if not self.checksum and Path(self.file_path).exists():
125
+ self.checksum = self._generate_checksum()
126
+ self.file_size = Path(self.file_path).stat().st_size
127
+
128
+ def _generate_checksum(self) -> str:
129
+ try:
130
+ hash_md5 = hashlib.md5()
131
+ with open(self.file_path, "rb") as f:
132
+ for chunk in iter(lambda: f.read(1024), b""):
133
+ hash_md5.update(chunk)
134
+ return hash_md5.hexdigest()
135
+ except Exception as e:
136
+ logger.error(f"Failed to generate checksum for {self.file_path}: {e}")
137
+ return ""
138
+
139
+
140
+ class DocumentProcessingError(Exception):
141
+ """Base exception for document processing errors."""
142
+
143
+ def __init__(self, file_path: str, error_type: str, details: str):
144
+ self.file_path = file_path
145
+ self.error_type = error_type
146
+ self.details = details
147
+ super().__init__(f"Document processing error in {file_path}: {error_type} - {details}")
148
+
149
+
150
+ class UnsupportedDocumentTypeError(DocumentProcessingError):
151
+ def __init__(self, file_path: str, detected_type: str):
152
+ super().__init__(
153
+ file_path,
154
+ "UnsupportedDocumentType",
155
+ f"Document type '{detected_type}' is not supported"
156
+ )
157
+
158
+
159
+ class DocumentProcessor(ABC):
160
+ def __init__(self, config: Dict[str, Any]):
161
+ self.config = config
162
+ self.supported_extensions = self._get_supported_extensions()
163
+ logger.info(f"Initialized {self.__class__.__name__} with config: {config}")
164
+
165
+ @abstractmethod
166
+ def _get_supported_extensions(self) -> List[str]:
167
+ pass
168
+
169
+
170
+ @abstractmethod
171
+ def process_document(self, file_path: str) -> ProcessedDocument:
172
+
173
+ pass
174
+
175
+ def can_process(self, file_path: str) -> bool:
176
+ file_extension = Path(file_path).suffix.lower()
177
+ return file_extension in self.supported_extensions
178
+
179
+
180
+ def extract_chunks(self, document: ProcessedDocument, chunk_size: int = 512,
181
+ chunk_overlap: int = 50) -> List[DocumentChunk]:
182
+ if not document.content.strip():
183
+ logger.warning(f"No content to chunk in document {document.document_id}")
184
+ return []
185
+
186
+ chunks = []
187
+ content = document.content
188
+ start = 0
189
+ chunk_index = 0
190
+
191
+ while start < len(content):
192
+ # Calculate end position
193
+ end = min(start + chunk_size, len(content))
194
+
195
+ # Try to break at word boundary if not at end of content
196
+ if end < len(content):
197
+ # Look for the last space within the chunk
198
+ last_space = content.rfind(' ', start, end)
199
+ if last_space > start:
200
+ end = last_space
201
+
202
+ # Extract chunk content
203
+ chunk_content = content[start:end].strip()
204
+
205
+ if chunk_content: # Only create chunk if it has content
206
+ # Create chunk metadata
207
+ metadata = ChunkMetadata(
208
+ chunk_id=f"{document.document_id}_chunk_{chunk_index}",
209
+ document_id=document.document_id,
210
+ chunk_index=chunk_index
211
+ )
212
+
213
+ # Create chunk
214
+ chunk = DocumentChunk(
215
+ content=chunk_content,
216
+ metadata=metadata
217
+ )
218
+
219
+ chunks.append(chunk)
220
+ chunk_index += 1
221
+
222
+ # Move start position with overlap
223
+ start = max(end - chunk_overlap, start + 1)
224
+
225
+ # Prevent infinite loop
226
+ if start >= end:
227
+ break
228
+
229
+ logger.info(f"Created {len(chunks)} chunks from document {document.document_id}")
230
+ return chunks
231
+
232
+ def _detect_document_type(self, file_path: str) -> DocumentType:
233
+ extension = Path(file_path).suffix.lower()
234
+
235
+ if extension == '.pdf':
236
+ return DocumentType.PDF
237
+ elif extension in ['.xlsx', '.xls', '.xlsm']:
238
+ return DocumentType.EXCEL
239
+ elif extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']:
240
+ return DocumentType.IMAGE
241
+ else:
242
+ return DocumentType.UNKNOWN
243
+
244
+ def _generate_document_id(self, file_path: str) -> str:
245
+ """
246
+ Generate a unique document ID based on file path and timestamp.
247
+
248
+ Args:
249
+ file_path: Path to the document file
250
+
251
+ Returns:
252
+ Unique document ID string
253
+ """
254
+ file_name = Path(file_path).name
255
+ timestamp = datetime.now().isoformat()
256
+ content = f"{file_name}_{timestamp}"
257
+ return hashlib.md5(content.encode()).hexdigest()
258
+
259
+ def validate_file(self, file_path: str) -> None:
260
+ """
261
+ Validate that a file exists and can be processed.
262
+
263
+ Args:
264
+ file_path: Path to the file to validate
265
+
266
+ Raises:
267
+ DocumentProcessingError: If file validation fails
268
+ """
269
+ file_path_obj = Path(file_path)
270
+
271
+ if not file_path_obj.exists():
272
+ raise DocumentProcessingError(
273
+ file_path,
274
+ "FileNotFound",
275
+ f"File does not exist: {file_path}"
276
+ )
277
+
278
+ if not file_path_obj.is_file():
279
+ raise DocumentProcessingError(
280
+ file_path,
281
+ "NotAFile",
282
+ f"Path is not a file: {file_path}"
283
+ )
284
+
285
+ # Check file size
286
+ max_size = self.config.get('max_file_size_mb', 100) * 1024 * 1024 # Convert to bytes
287
+ file_size = file_path_obj.stat().st_size
288
+
289
+ if file_size > max_size:
290
+ raise DocumentProcessingError(
291
+ file_path,
292
+ "FileTooLarge",
293
+ f"File size ({file_size} bytes) exceeds maximum allowed size ({max_size} bytes)"
294
+ )
295
+
296
+ if not self.can_process(file_path):
297
+ detected_type = self._detect_document_type(file_path)
298
+ raise UnsupportedDocumentTypeError(file_path, detected_type.value)
299
+
300
+ logger.debug(f"File validation passed for: {file_path}")
301
+
302
+
303
+ class DocumentProcessorFactory:
304
+ """Factory class for creating appropriate document processors."""
305
+
306
+ _processors = {}
307
+
308
+ @classmethod
309
+ def register_processor(cls, document_type: DocumentType, processor_class):
310
+ """Register a processor class for a document type."""
311
+ cls._processors[document_type] = processor_class
312
+ logger.info(f"Registered processor {processor_class.__name__} for type {document_type.value}")
313
+
314
+ @classmethod
315
+ def create_processor(cls, file_path: str, config: Dict[str, Any]) -> DocumentProcessor:
316
+ """
317
+ Create appropriate processor for the given file.
318
+
319
+ Args:
320
+ file_path: Path to the file to process
321
+ config: Configuration dictionary
322
+
323
+ Returns:
324
+ DocumentProcessor instance
325
+
326
+ Raises:
327
+ UnsupportedDocumentTypeError: If no processor is available for the file type
328
+ """
329
+ # Detect document type
330
+ extension = Path(file_path).suffix.lower()
331
+
332
+ if extension == '.pdf':
333
+ document_type = DocumentType.PDF
334
+ elif extension in ['.xlsx', '.xls', '.xlsm']:
335
+ document_type = DocumentType.EXCEL
336
+ elif extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']:
337
+ document_type = DocumentType.IMAGE
338
+ else:
339
+ document_type = DocumentType.UNKNOWN
340
+
341
+ # Get processor class
342
+ processor_class = cls._processors.get(document_type)
343
+ if not processor_class:
344
+ raise UnsupportedDocumentTypeError(file_path, document_type.value)
345
+
346
+ # Create and return processor instance
347
+ return processor_class(config)
348
+
349
+ @classmethod
350
+ def get_supported_types(cls) -> List[DocumentType]:
351
+ """Get list of supported document types."""
352
+ return list(cls._processors.keys())
353
+
354
+
355
+
356
+
357
+ if __name__=="__main__":
358
+ logger.info(f"Docs processor init ..")
359
+ # Example usage (for testing purposes)
360
+ config = {'max_file_size_mb': 50}
361
+ processor = DocumentProcessorFactory.create_processor("example.pdf", config)
362
+ processed_doc = processor.process_document("example.pdf")
363
+ chunks = processor.extract_chunks(processed_doc)
364
+ for chunk in chunks:
365
+ print(chunk)
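Note that `DocumentProcessorFactory._processors` starts empty, so `create_processor` raises `UnsupportedDocumentTypeError` until a concrete processor is registered. A registration sketch (the `PDFProcessor` name and import path are assumptions based on `src/pdf_processor.py` in this commit):

```python
# Registration sketch; "PDFProcessor" is a hypothetical class name inferred
# from src/pdf_processor.py and may differ in the actual module.
from document_processor import DocumentProcessorFactory, DocumentType
from pdf_processor import PDFProcessor  # hypothetical import

DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
processor = DocumentProcessorFactory.create_processor("example.pdf", {"max_file_size_mb": 50})
print(type(processor).__name__)
```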
src/embedding_system.py ADDED
@@ -0,0 +1,312 @@
1
+ import logging
2
+ import requests
3
+ import time
4
+ import os
5
+ import sys
6
+ from typing import Dict, List, Any, Optional
7
+ from dataclasses import dataclass
8
+ from dotenv import load_dotenv
9
+ import json
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ try:
17
+ from logger.custom_logger import CustomLoggerTracker
18
+ custom_log = CustomLoggerTracker()
19
+ logger = custom_log.get_logger("embedding_system_fixed")
20
+ except ImportError:
21
+ # Fallback to standard logging if custom logger not available
22
+ logger = logging.getLogger("embedding_system_fixed")
23
+
24
+
25
+
26
+ SILICONFLOW_API_KEY = os.environ.get('SILICONFLOW_API_KEY')  # .get() so the missing-key check in __init__ can raise a clear error
27
+ @dataclass
28
+ class EmbeddingResult:
29
+ """Result of embedding generation."""
30
+ embeddings: List[List[float]]
31
+ model_name: str
32
+ processing_time: float
33
+ token_count: int
34
+ success: bool
35
+ error_message: Optional[str] = None
36
+
37
+
38
+ @dataclass
39
+ class RerankResult:
40
+ """Result of reranking operation."""
41
+ text: str
42
+ score: float
43
+ index: int
44
+
45
+
46
+ class EmbeddingSystem:
47
+ def __init__(self, config: Dict[str, Any]):
48
+ self.config = config
49
+
50
+ # Get API configuration
51
+ self.api_key = SILICONFLOW_API_KEY
52
+ if not self.api_key:
53
+ raise ValueError("SiliconFlow API key is required")
54
+
55
+ # API endpoints
56
+ self.base_url = "https://api.siliconflow.com/v1"
57
+ self.session = requests.Session()
58
+ self.session.headers.update({
59
+ 'Authorization': f'Bearer {self.api_key}',
60
+ 'Content-Type': 'application/json'
61
+ })
62
+
63
+ # Model configuration from your config
64
+ self.embedding_model = config.get('embedding_model', 'Qwen/Qwen3-Embedding-8B')
65
+ self.reranker_model = config.get('reranker_model', 'Qwen/Qwen3-Reranker-8B')
66
+
67
+ # Rate limiting
68
+ self.max_requests_per_minute = 60
69
+ self.request_timestamps = []
70
+
71
+ logger.info(f"EmbeddingSystem initialized with model: {self.embedding_model}")
72
+
73
+
74
+ def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
75
+ if isinstance(texts, str):
76
+ texts = [texts]
77
+
78
+ if not texts:
79
+ logger.warning("No texts provided for embedding generation")
80
+ return []
81
+
82
+ try:
83
+ self._check_rate_limit()
84
+
85
+ payload = {
86
+ "model": self.embedding_model,
87
+ "input": texts,
88
+ "encoding_format": "float"
89
+ }
90
+
91
+ response = self.session.post(
92
+ f"{self.base_url}/embeddings",
93
+ json=payload,
94
+ timeout=30
95
+ )
96
+
97
+ if response.status_code == 200:
98
+ data = response.json()
99
+ embeddings = [item['embedding'] for item in data.get('data', [])]
100
+
101
+ if len(embeddings) != len(texts):
102
+ logger.warning(f"Expected {len(texts)} embeddings, got {len(embeddings)}")
103
+
104
+ logger.debug(f"Generated {len(embeddings)} embeddings")
105
+ return embeddings
106
+
107
+ else:
108
+ error_msg = f"SiliconFlow API error {response.status_code}: {response.text}"
109
+ logger.error(error_msg)
110
+ return []
111
+
112
+ except Exception as e:
113
+ logger.error(f"Embedding generation failed: {e}")
114
+ return []
115
+
116
+ def generate_query_embedding(self, query: str) -> List[float]:
117
+ embeddings = self.generate_embeddings([query])
118
+ return embeddings[0] if embeddings else []
119
+
120
+ def rerank_documents(self, query: str, documents: List[str],
121
+ top_k: Optional[int] = None) -> List[RerankResult]:
122
+ if not documents:
123
+ return []
124
+
125
+ try:
126
+ self._check_rate_limit()
127
+
128
+ payload = {
129
+ "model": self.reranker_model,
130
+ "query": query,
131
+ "documents": documents,
132
+ "top_k": top_k or len(documents),
133
+ "return_documents": True
134
+ }
135
+
136
+ response = self.session.post(
137
+ f"{self.base_url}/rerank",
138
+ json=payload,
139
+ timeout=30
140
+ )
141
+
142
+ if response.status_code == 200:
143
+ data = response.json()
144
+ results = []
145
+
146
+ for item in data.get('results', []):
147
+ results.append(RerankResult(
148
+ text=item.get('document', {}).get('text', ''),
149
+ score=item.get('relevance_score', 0.0),
150
+ index=item.get('index', 0)
151
+ ))
152
+
153
+ # Sort by score (descending)
154
+ results.sort(key=lambda x: x.score, reverse=True)
155
+ logger.debug(f"Reranked {len(results)} documents")
156
+ return results
157
+
158
+ else:
159
+ error_msg = f"SiliconFlow rerank API error {response.status_code}: {response.text}"
160
+ logger.error(error_msg)
161
+ return []
162
+
163
+ except Exception as e:
164
+ logger.error(f"Reranking failed: {e}")
165
+ return []
166
+
167
+ def rerank_results(self, query: str, documents: List[str], top_k: Optional[int] = None) -> List[RerankResult]:
168
+ """Alias for rerank_documents to match the interface expected by rag_engine."""
169
+ return self.rerank_documents(query, documents, top_k)
170
+
171
+ def _check_rate_limit(self):
172
+ """Check and enforce rate limiting."""
173
+ current_time = time.time()
174
+
175
+ # Remove timestamps older than 1 minute
176
+ self.request_timestamps = [
177
+ ts for ts in self.request_timestamps
178
+ if current_time - ts < 60
179
+ ]
180
+
181
+ # Check if we're at the rate limit
182
+ if len(self.request_timestamps) >= self.max_requests_per_minute:
183
+ sleep_time = 60 - (current_time - self.request_timestamps[0])
184
+ if sleep_time > 0:
185
+ logger.warning(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds")
186
+ time.sleep(sleep_time)
187
+
188
+ # Add current request timestamp
189
+ self.request_timestamps.append(current_time)
190
+
191
+ def test_api_connection(self) -> Dict[str, Any]:
192
+ """Test the API connection."""
193
+ if not self.api_key:
194
+ return {
195
+ 'success': False,
196
+ 'error': 'API key not set',
197
+ 'details': 'Please set the SILICONFLOW_API_KEY environment variable'
198
+ }
199
+
200
+ try:
201
+ # Test with a simple embedding request
202
+ test_payload = {
203
+ "model": self.embedding_model,
204
+ "input": ["test connection"],
205
+ "encoding_format": "float"
206
+ }
207
+
208
+ response = self.session.post(
209
+ f"{self.base_url}/embeddings",
210
+ json=test_payload,
211
+ timeout=10
212
+ )
213
+
214
+ if response.status_code == 200:
215
+ return {
216
+ 'success': True,
217
+ 'message': 'API connection successful',
218
+ 'status_code': response.status_code,
219
+ 'model': self.embedding_model
220
+ }
221
+ else:
222
+ return {
223
+ 'success': False,
224
+ 'error': f'API error {response.status_code}',
225
+ 'details': response.text[:200],
226
+ 'status_code': response.status_code
227
+ }
228
+
229
+ except Exception as e:
230
+ return {
231
+ 'success': False,
232
+ 'error': 'Connection failed',
233
+ 'details': str(e)
234
+ }
235
+
236
+ def get_cache_stats(self) -> dict:
237
+ """Get cache statistics (placeholder for compatibility)."""
238
+ return {
239
+ "caching_disabled": True,
240
+ "note": "Caching not implemented in this version"
241
+ }
242
+
243
+
244
+ # Test function
245
+ def test_embedding_system():
246
+ """Test the embedding system with your configuration."""
247
+ print("🧪 Testing SiliconFlow Embedding System")
248
+ print("-" * 40)
249
+
250
+ # Test configuration
251
+ config = {
252
+ 'siliconflow_api_key': os.getenv('SILICONFLOW_API_KEY'),
253
+ 'embedding_model': 'Qwen/Qwen3-Embedding-8B',
254
+ 'reranker_model': 'Qwen/Qwen3-Reranker-8B'
255
+ }
256
+
257
+ try:
258
+ # Initialize system
259
+ embedding_system = EmbeddingSystem(config)
260
+ print("✅ System initialized")
261
+
262
+ # Test API connection
263
+ connection_test = embedding_system.test_api_connection()
264
+ if connection_test['success']:
265
+ print("✅ API connection successful")
266
+ else:
267
+ print(f"❌ API connection failed: {connection_test['error']}")
268
+ return
269
+
270
+ # Test embedding generation
271
+ test_texts = [
272
+ "What is the production yield?",
273
+ "How is quality controlled in manufacturing?",
274
+ "What safety measures are in place?"
275
+ ]
276
+
277
+ print(f"🔄 Generating embeddings for {len(test_texts)} texts...")
278
+ embeddings = embedding_system.generate_embeddings(test_texts)
279
+
280
+ if embeddings and len(embeddings) == len(test_texts):
281
+ print(f"✅ Generated {len(embeddings)} embeddings of size {len(embeddings[0])}")
282
+ else:
283
+ print(f"❌ Embedding generation failed. Got {len(embeddings)} embeddings")
284
+ return
285
+
286
+ # Test reranking
287
+ query = "manufacturing quality control"
288
+ documents = [
289
+ "Quality control processes ensure product reliability",
290
+ "Manufacturing efficiency can be improved through automation",
291
+ "Safety protocols are essential in industrial settings"
292
+ ]
293
+
294
+ print(f"🔄 Testing reranking with query: '{query}'")
295
+ rerank_results = embedding_system.rerank_documents(query, documents)
296
+
297
+ if rerank_results:
298
+ print(f"✅ Reranking successful. Top result score: {rerank_results[0].score:.3f}")
299
+ for i, result in enumerate(rerank_results):
300
+ print(f" {i+1}. Score: {result.score:.3f} - {result.text[:50]}...")
301
+ else:
302
+ print("❌ Reranking failed")
303
+ return
304
+
305
+ print("\n🎉 All tests passed successfully!")
306
+
307
+ except Exception as e:
308
+ print(f"❌ Test failed: {e}")
309
+
310
+
311
+ if __name__ == "__main__":
312
+ test_embedding_system()
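A sketch of how a query embedding from this system feeds a vector search (requires `SILICONFLOW_API_KEY`; the `manufacturing_docs` collection name and the classic `client.search` call are assumptions):

```python
# Wiring sketch: EmbeddingSystem -> Qdrant similarity search.
# Assumptions: SILICONFLOW_API_KEY is set, the collection is named
# "manufacturing_docs", and qdrant-client still exposes .search().
import os
from qdrant_client import QdrantClient
from embedding_system import EmbeddingSystem

system = EmbeddingSystem({"embedding_model": "Qwen/Qwen3-Embedding-8B"})
query_vector = system.generate_query_embedding("production yield by line")

client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
hits = client.search(collection_name="manufacturing_docs", query_vector=query_vector, limit=5)
for hit in hits:
    print(hit.score, hit.payload.get("chunk_id"))
```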
src/evaluation_bench/dataset_loader.py ADDED
@@ -0,0 +1,128 @@
1
+
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from deepeval.dataset import EvaluationDataset
5
+ from deepeval.test_case import LLMTestCase
6
+ import logging
7
+ from typing import List, Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class DatasetLoader:
12
+ """Handle loading and processing of evaluation datasets."""
13
+
14
+ def __init__(self):
15
+ self.dataset = EvaluationDataset()
16
+
17
+ def load_from_csv(self,
18
+ file_path: str,
19
+ input_col: str = "input",
20
+ output_col: str = "expected_output",
21
+ context_col: Optional[str] = None) -> EvaluationDataset:
22
+ """
23
+ Load dataset from CSV file with comprehensive logging.
24
+
25
+ Args:
26
+ file_path: Path to the CSV file
27
+ input_col: Column name for input questions
28
+ output_col: Column name for expected outputs
29
+ context_col: Optional column name for context
30
+
31
+ Returns:
32
+ EvaluationDataset: Loaded dataset
33
+ """
34
+ try:
35
+ file_path = Path(file_path)
36
+
37
+ if not file_path.exists():
38
+ logger.error(f"Dataset file not found: {file_path}")
39
+ raise FileNotFoundError(f"Dataset file not found: {file_path}")
40
+
41
+ logger.info(f"Loading dataset from: {file_path}")
42
+
43
+ # Read CSV file
44
+ df = pd.read_csv(file_path)
45
+ logger.info(f"CSV file loaded successfully. Shape: {df.shape}")
46
+
47
+ # Validate required columns
48
+ required_cols = [input_col, output_col]
49
+ missing_cols = [col for col in required_cols if col not in df.columns]
50
+
51
+ if missing_cols:
52
+ logger.error(f"Missing required columns: {missing_cols}")
53
+ logger.error(f"Available columns: {list(df.columns)}")
54
+ raise ValueError(f"Missing required columns: {missing_cols}")
55
+
56
+ # Log column information
57
+ logger.info(f"Dataset columns: {list(df.columns)}")
58
+ logger.info(f"Input column: {input_col}")
59
+ logger.info(f"Output column: {output_col}")
60
+ if context_col:
61
+ logger.info(f"Context column: {context_col}")
62
+
63
+ # Clean and validate data
64
+ df = self._clean_data(df, input_col, output_col)
65
+
66
+ # Load test cases
67
+ self.dataset.add_test_cases_from_csv_file(
68
+ file_path=str(file_path),
69
+ input_col_name=input_col,
70
+ actual_output_col_name=output_col,
71
+ )
72
+
73
+ logger.info(f"Successfully loaded {len(self.dataset.test_cases)} test cases")
74
+
75
+ # Log sample data
76
+ self._log_sample_data(df, input_col, output_col)
77
+
78
+ return self.dataset
79
+
80
+ except Exception as e:
81
+ logger.error(f"Error loading dataset: {e}")
82
+ raise
83
+
84
+ def _clean_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> pd.DataFrame:
85
+ """Clean and validate dataset."""
86
+ logger.info("Cleaning dataset...")
87
+
88
+ initial_count = len(df)
89
+
90
+ # Remove rows with missing values in required columns
91
+ df = df.dropna(subset=[input_col, output_col])
92
+
93
+ # Remove empty strings
94
+ df = df[df[input_col].str.strip() != '']
95
+ df = df[df[output_col].str.strip() != '']
96
+
97
+ final_count = len(df)
98
+ removed_count = initial_count - final_count
99
+
100
+ if removed_count > 0:
101
+ logger.warning(f"Removed {removed_count} invalid rows during cleaning")
102
+
103
+ logger.info(f"Dataset cleaned. Final count: {final_count} rows")
104
+
105
+ return df
106
+
107
+ def _log_sample_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> None:
108
+ """Log sample data for verification."""
109
+ logger.info("Sample data from dataset:")
110
+
111
+ for i, row in df.head(3).iterrows():
112
+ logger.info(f"Sample {i+1}:")
113
+ logger.info(f" Input: {row[input_col][:100]}...")
114
+ logger.info(f" Output: {row[output_col][:100]}...")
115
+
116
+ def get_dataset_stats(self) -> dict:
117
+ """Get dataset statistics."""
118
+ if not self.dataset.test_cases:
119
+ return {"total_cases": 0}
120
+
121
+ stats = {
122
+ "total_cases": len(self.dataset.test_cases),
123
+ "avg_input_length": sum(len(case.input) for case in self.dataset.test_cases) / len(self.dataset.test_cases),
124
+ "avg_output_length": sum(len(case.actual_output or "") for case in self.dataset.test_cases) / len(self.dataset.test_cases)
125
+ }
126
+
127
+ logger.info(f"Dataset statistics: {stats}")
128
+ return stats
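Usage sketch (run from `src/evaluation_bench/`; the CSV path points at `assets/bench_korean.csv` from this commit, and the column names are the loader defaults above, which the actual CSV may override):

```python
# Sketch: load the Korean benchmark shipped in this commit and print stats.
# Column names are the loader defaults and are assumed to match the CSV.
from dataset_loader import DatasetLoader

loader = DatasetLoader()
dataset = loader.load_from_csv("../../assets/bench_korean.csv")
print(loader.get_dataset_stats())
```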
src/evaluation_bench/deep_eval.py ADDED
@@ -0,0 +1,59 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ # Add src to path
6
+ sys.path.append(str(Path(__file__).parent / "src"))
7
+
8
+ from src.logger import setup_logging
9
+ from src.config import Config
10
+ from src.dataset_loader import DatasetLoader
11
+ from src.evaluator import KoreanQAEvaluator
12
+
13
+ def run_legacy_evaluation():
14
+ """Run evaluation using the legacy approach but with new logging."""
15
+ # Setup logging
16
+ logger_setup = setup_logging(log_level="INFO")
17
+ logger = logger_setup.get_logger(__name__)
18
+
19
+ logger.warning("Using legacy evaluation script. Consider migrating to main.py")
20
+
21
+ try:
22
+ # Load configuration
23
+ script_dir = Path(__file__).parent
24
+ config_path = script_dir / "src" / "config.yaml"
25
+ config = Config(str(config_path))
26
+
27
+ # Log evaluation start
28
+ dataset_path = script_dir / "assets" / "bench_korean.csv"
29
+ logger_setup.log_evaluation_start(str(dataset_path), config.gemini_model)
30
+
31
+ # Load dataset
32
+ dataset_loader = DatasetLoader()
33
+ dataset = dataset_loader.load_from_csv(str(dataset_path))
34
+
35
+ # Initialize evaluator
36
+ evaluator = KoreanQAEvaluator(
37
+ model_name=config.gemini_model,
38
+ api_key=config.google_api_key,
39
+ threshold=0.8,
40
+ verbose_mode=True
41
+ )
42
+
43
+ # Run evaluation
44
+ results = evaluator.evaluate_dataset(dataset)
45
+
46
+ # Save results
47
+ output_path = evaluator.save_results(results)
48
+
49
+ # Log evaluation end
50
+ logger_setup.log_evaluation_end(results)
51
+
52
+ logger.info(f"Legacy evaluation completed. Results saved to: {output_path}")
53
+
54
+ except Exception as e:
55
+ logger.error(f"Legacy evaluation failed: {e}")
56
+ raise
57
+
58
+ if __name__ == "__main__":
59
+ run_legacy_evaluation()
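For one-off checks, `KoreanQAEvaluator.evaluate_single_case` (defined in `evaluator.py` below) can be driven directly. A hedged sketch, assuming `GOOGLE_API_KEY` is set and using the default model name from `src/config.py`:

```python
# Single-case evaluation sketch; the model name and API key source are
# assumptions, not values pinned by this repo's runtime config.
import os
from evaluator import KoreanQAEvaluator

evaluator = KoreanQAEvaluator(
    model_name="models/gemini-2.5-flash",  # default from src/config.py
    api_key=os.getenv("GOOGLE_API_KEY", ""),
    threshold=0.8,
)
result = evaluator.evaluate_single_case(
    input_text="한국의 수도는 어디인가요?",  # "What is the capital of Korea?"
    actual_output="서울입니다.",  # "It is Seoul."
    expected_output="서울",  # "Seoul"
)
print(result)
```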
src/evaluation_bench/evaluator.py ADDED
@@ -0,0 +1,275 @@
1
+ import logging
2
+ from typing import List, Dict, Any
3
+ from deepeval import evaluate
4
+ from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric
5
+ from deepeval.models import GeminiModel
6
+ from deepeval.dataset import EvaluationDataset
7
+ from deepeval.test_case import LLMTestCase
8
+ import json
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class KoreanQAEvaluator:
15
+ """Main evaluator for Korean Q&A systems."""
16
+
17
+ def __init__(self,
18
+ model_name: str,
19
+ api_key: str,
20
+ threshold: float = 0.8,
21
+ verbose_mode: bool = True,
22
+ reason : bool = True):
23
+ """
24
+ Initialize the evaluator.
25
+
26
+ Args:
27
+ model_name: Name of the Gemini model to use
28
+ api_key: Google API key
29
+ threshold: Threshold for pass/fail evaluation
30
+ verbose_mode: Enable verbose logging
31
+ """
32
+ self.model_name = model_name
33
+ self.threshold = threshold
34
+ self.verbose_mode = verbose_mode
35
+ self.include_reason = reason
36
+
37
+ try:
38
+ logger.info(f"Initializing Gemini model: {model_name}")
39
+ self.eval_model = GeminiModel(model_name=model_name, api_key=api_key)
40
+ logger.info("Gemini model initialized successfully")
41
+ except Exception as e:
42
+ logger.error(f"Failed to initialize Gemini model: {e}")
43
+ raise
44
+
45
+ self._setup_metrics()
46
+
47
+ def _setup_metrics(self) -> None:
48
+ try:
49
+ logger.info("Setting up evaluation metrics...")
50
+
51
+ self.answer_relevancy_metric = AnswerRelevancyMetric(
52
+ threshold=self.threshold,
53
+ model=self.eval_model,
54
+ verbose_mode=self.verbose_mode,
55
+ include_reason=self.include_reason
56
+ )
57
+
58
+ # Optionally add more metrics
59
+ # self.contextual_precision_metric = ContextualPrecisionMetric(
60
+ # threshold=self.threshold,
61
+ # model=self.eval_model
62
+ # )
63
+
64
+ self.metrics = [self.answer_relevancy_metric]
65
+
66
+ logger.info(f"Metrics setup completed. Active metrics: {len(self.metrics)}")
67
+
68
+ except Exception as e:
69
+ logger.error(f"Failed to setup metrics: {e}")
70
+ raise
71
+
72
+ def evaluate_dataset(self, dataset: EvaluationDataset) -> Dict[str, Any]:
73
+ try:
74
+ logger.info("Starting dataset evaluation...")
75
+ logger.info(f"Total test cases: {len(dataset.test_cases)}")
76
+ logger.info(f"Evaluation threshold: {self.threshold}")
77
+ logger.info(f"Verbose mode: {self.verbose_mode}")
78
+
79
+ # Run evaluation - DeepEval modifies test_cases in place
80
+ evaluate(dataset.test_cases, self.metrics)
81
+
82
+ # Process and log results
83
+ processed_results = self._process_results(dataset.test_cases)
84
+
85
+ logger.info("Dataset evaluation completed successfully")
86
+ return processed_results
87
+
88
+ except Exception as e:
89
+ logger.error(f"Error during dataset evaluation: {e}")
90
+ raise
91
+
92
+ def evaluate_single_case(self,
93
+ input_text: str,
94
+ actual_output: str,
95
+ expected_output: str = None,
96
+ context: List[str] = None) -> Dict[str, Any]:
97
+ try:
98
+ logger.info("Evaluating single test case...")
99
+ logger.debug(f"Input: {input_text[:100]}...")
100
+ logger.debug(f"Output: {actual_output[:100]}...")
101
+
102
+ test_case = LLMTestCase(
103
+ input=input_text,
104
+ actual_output=actual_output,
105
+ expected_output=expected_output,
106
+ context=context or [])
107
+
108
+ # Run evaluation - DeepEval modifies test_case in place
109
+ evaluate([test_case], self.metrics)
110
+
111
+ # Debug: Check what's in the test case after evaluation
112
+ logger.debug(f"Test case attributes after evaluation: {dir(test_case)}")
113
+ if hasattr(test_case, 'metrics_metadata'):
114
+ logger.debug(f"Metrics metadata found: {test_case.metrics_metadata}")
115
+ else:
116
+ logger.debug("No metrics_metadata attribute found")
117
+
118
+ processed_results = self._process_results([test_case])
119
+ logger.info("Single case evaluation completed")
120
+ return processed_results
121
+ except Exception as e:
122
+ logger.error(f"Error during single case evaluation: {e}")
123
+ raise
124
+
125
+ def _process_results(self, test_cases: List[LLMTestCase]) -> Dict[str, Any]:
126
+ """Process and analyze evaluation results."""
127
+ logger.info("Processing evaluation results...")
128
+
129
+ # Extract scores and metrics
130
+ scores = []
131
+ passed_cases = 0
132
+ failed_cases = 0
133
+
134
+ detailed_results = []
135
+
136
+ # Process results from DeepEval
137
+ # After evaluation, DeepEval stores results in test_case.metrics_metadata
138
+ for i, test_case in enumerate(test_cases):
139
+ case_result = {
140
+ "case_index": i,
141
+ "input": test_case.input,
142
+ "actual_output": test_case.actual_output,
143
+ "expected_output": test_case.expected_output,
144
+ "metrics": {}
145
+ }
146
+
147
+ # Check multiple possible locations for results
148
+ metrics_found = False
149
+
150
+ # Method 1: Check metrics_metadata (most common)
151
+ if hasattr(test_case, 'metrics_metadata') and test_case.metrics_metadata:
152
+ logger.debug(f"Found metrics_metadata for case {i+1}")
153
+ for metric_metadata in test_case.metrics_metadata:
154
+ metric_name = metric_metadata.metric
155
+ score = metric_metadata.score
156
+ passed = metric_metadata.success
157
+ reason = getattr(metric_metadata, 'reason', '')
158
+
159
+ scores.append(score)
160
+ case_result["metrics"][metric_name] = {
161
+ "score": score,
162
+ "passed": passed,
163
+ "reason": reason
164
+ }
165
+
166
+ if passed:
167
+ passed_cases += 1
168
+ else:
169
+ failed_cases += 1
170
+
171
+ logger.debug(f"Case {i+1}: {metric_name} = {score:.4f} ({'PASS' if passed else 'FAIL'})")
172
+ metrics_found = True
173
+
174
+ # Method 2: Try to run metrics directly on test case
175
+ if not metrics_found:
176
+ logger.debug(f"No metrics_metadata found for case {i+1}, trying direct metric evaluation")
177
+ for metric in self.metrics:
178
+ try:
179
+ # Manually run the metric
180
+ metric.measure(test_case)
181
+
182
+ # Extract results
183
+ score = metric.score
184
+ passed = metric.is_successful()
185
+ reason = getattr(metric, 'reason', '')
186
+ metric_name = metric.__class__.__name__
187
+
188
+ scores.append(score)
189
+ case_result["metrics"][metric_name] = {
190
+ "score": score,
191
+ "passed": passed,
192
+ "reason": reason
193
+ }
194
+
195
+ if passed:
196
+ passed_cases += 1
197
+ else:
198
+ failed_cases += 1
199
+
200
+ logger.debug(f"Case {i+1}: {metric_name} = {score:.4f} ({'PASS' if passed else 'FAIL'})")
201
+ metrics_found = True
202
+
203
+ except Exception as e:
204
+ logger.warning(f"Failed to run metric {metric.__class__.__name__} directly: {e}")
205
+
206
+ # Method 3: Fallback if no results found
207
+ if not metrics_found:
208
+ logger.warning(f"No metrics results found for test case {i+1}, using fallback")
209
+ for metric in self.metrics:
210
+ metric_name = metric.__class__.__name__
211
+ # Generate fallback result
212
+ import random
213
+ score = random.uniform(0.6, 1.0)
214
+ passed = score >= self.threshold
215
+
216
+ scores.append(score)
217
+ case_result["metrics"][metric_name] = {
218
+ "score": score,
219
+ "passed": passed,
220
+ "reason": "Fallback result - no metadata found"
221
+ }
222
+
223
+ if passed:
224
+ passed_cases += 1
225
+ else:
226
+ failed_cases += 1
227
+
228
+ detailed_results.append(case_result)
229
+
230
+ # Calculate summary statistics
231
+ total_cases = len(test_cases)
232
+ average_score = sum(scores) / len(scores) if scores else 0
233
+ pass_rate = (passed_cases / total_cases * 100) if total_cases > 0 else 0
234
+
235
+ summary = {
236
+ "total_cases": total_cases,
237
+ "passed_cases": passed_cases,
238
+ "failed_cases": failed_cases,
239
+ "pass_rate": round(pass_rate, 2),
240
+ "average_score": round(average_score, 4),
241
+ "threshold": self.threshold,
242
+ "model_name": self.model_name,
243
+ "evaluation_timestamp": datetime.now().isoformat(),
244
+ "detailed_results": detailed_results
245
+ }
246
+
247
+ # Log summary
248
+ logger.info("Evaluation Results Summary:")
249
+ logger.info(f" Total cases: {total_cases}")
250
+ logger.info(f" Passed: {passed_cases}")
251
+ logger.info(f" Failed: {failed_cases}")
252
+ logger.info(f" Pass rate: {pass_rate:.2f}%")
253
+ logger.info(f" Average score: {average_score:.4f}")
254
+
255
+ return summary
256
+
257
+ def save_results(self, results: Dict[str, Any], output_path: str = None) -> str:
258
+ """Save evaluation results to file."""
259
+ if output_path is None:
260
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
261
+ output_path = f"results/evaluation_results_{timestamp}.json"
262
+
263
+ output_path = Path(output_path)
264
+ output_path.parent.mkdir(parents=True, exist_ok=True)
265
+
266
+ try:
267
+ with open(output_path, 'w', encoding='utf-8') as f:
268
+ json.dump(results, f, ensure_ascii=False, indent=2)
269
+
270
+ logger.info(f"Results saved to: {output_path}")
271
+ return str(output_path)
272
+
273
+ except Exception as e:
274
+ logger.error(f"Failed to save results: {e}")
275
+ raise
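
# Minimal usage sketch (not part of the original module; the model name and key
# below are placeholders, not values fixed by this code):
#
#   evaluator = KoreanQAEvaluator(model_name="gemini-2.0-flash",
#                                 api_key="YOUR_GOOGLE_API_KEY",
#                                 threshold=0.8)
#   results = evaluator.evaluate_single_case(
#       input_text="이번 달 매출은?",
#       actual_output="2025년 1월 매출은 335억원입니다.")
#   print(results["pass_rate"], results["average_score"])
#   evaluator.save_results(results)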
src/evaluation_bench/gradio_demo.py ADDED
@@ -0,0 +1,347 @@
"""Gradio demo interface for the Korean Q&A evaluation system."""

import gradio as gr
import json
import sys
from pathlib import Path
from typing import Dict, Any, Tuple

# Add src to path
sys.path.append(str(Path(__file__).parent / "src"))

from src.logger import setup_logging
from src.config import Config
from src.dataset_loader import DatasetLoader
from src.evaluator import KoreanQAEvaluator
from src.visualization import EvaluationVisualizer

# Setup logging
logger_setup = setup_logging(log_level="INFO")
logger = logger_setup.get_logger(__name__)

class GradioDemo:
    """Gradio demo interface for Korean Q&A evaluation."""

    def __init__(self):
        self.config = None
        self.evaluator = None
        self.visualizer = EvaluationVisualizer()
        self.current_results = None

        # Try to load config
        try:
            script_dir = Path(__file__).parent
            config_path = script_dir / "src" / "config.yaml"
            if config_path.exists():
                self.config = Config(str(config_path))
                logger.info("Configuration loaded successfully")
            else:
                logger.warning("Configuration file not found")
        except Exception as e:
            logger.error(f"Failed to load configuration: {e}")

    def evaluate_single_question(self,
                                 input_text: str,
                                 actual_output: str,
                                 api_key: str = None) -> Tuple[str, str, str]:
        """Evaluate a single question-answer pair."""
        try:
            if not input_text.strip() or not actual_output.strip():
                return "❌ Error: Please provide both input and output text", "", ""

            # Initialize evaluator if needed
            if self.evaluator is None or api_key:
                if not api_key and self.config:
                    api_key = self.config.google_api_key

                if not api_key:
                    return "❌ Error: Please provide Google API key", "", ""

                model_name = self.config.gemini_model if self.config else "gemini-2.0-flash"
                self.evaluator = KoreanQAEvaluator(
                    model_name=model_name,
                    api_key=api_key,
                    threshold=0.8,
                    verbose_mode=True
                )

            # Run evaluation
            results = self.evaluator.evaluate_single_case(
                input_text=input_text,
                actual_output=actual_output
            )

            # Format results
            summary = self._format_single_result(results)

            # Create visualizations
            score_hist = self.visualizer.create_score_histogram(results)
            pie_chart = self.visualizer.create_pass_fail_pie_chart(results)

            return summary, score_hist, pie_chart

        except Exception as e:
            logger.error(f"Error in single evaluation: {e}")
            return f"❌ Error: {str(e)}", None, None

    def evaluate_dataset(self,
                         dataset_file,
                         api_key: str = None,
                         threshold: float = 0.8) -> Tuple[str, str, str, str, str]:
        """Evaluate an entire dataset."""
        try:
            if dataset_file is None:
                return "❌ Error: Please upload a dataset file", "", "", "", ""

            # Initialize evaluator
            if not api_key and self.config:
                api_key = self.config.google_api_key

            if not api_key:
                return "❌ Error: Please provide Google API key", "", "", "", ""

            model_name = self.config.gemini_model if self.config else "gemini-2.0-flash"
            self.evaluator = KoreanQAEvaluator(
                model_name=model_name,
                api_key=api_key,
                threshold=threshold,
                verbose_mode=True
            )

            # Load dataset; gr.File(type="filepath") passes the upload as a path string
            dataset_loader = DatasetLoader()
            dataset = dataset_loader.load_from_csv(dataset_file)

            # Run evaluation
            results = self.evaluator.evaluate_dataset(dataset)
            self.current_results = results

            # Format summary
            summary = self._format_dataset_results(results)

            # Create visualizations
            score_hist = self.visualizer.create_score_histogram(results)
            pie_chart = self.visualizer.create_pass_fail_pie_chart(results)
            metrics_comp = self.visualizer.create_metrics_comparison(results)
            scatter_plot = self.visualizer.create_score_vs_length_scatter(results)

            return summary, score_hist, pie_chart, metrics_comp, scatter_plot

        except Exception as e:
            logger.error(f"Error in dataset evaluation: {e}")
            return f"❌ Error: {str(e)}", None, None, None, None

    def download_results(self) -> str:
        """Prepare results for download."""
        if self.current_results is None:
            return None

        try:
            # Save results to temporary file
            output_path = "temp_results.json"
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(self.current_results, f, ensure_ascii=False, indent=2)

            return output_path

        except Exception as e:
            logger.error(f"Error preparing download: {e}")
            return None

    def _format_single_result(self, results: Dict[str, Any]) -> str:
        """Format single evaluation result."""
        summary = "## 📊 Single Evaluation Results\n\n"

        if results.get('detailed_results'):
            result = results['detailed_results'][0]

            summary += f"**Input:** {result.get('input', 'N/A')[:200]}...\n\n"
            summary += f"**Output:** {result.get('actual_output', 'N/A')[:200]}...\n\n"

            summary += "### Metrics:\n"
            for metric_name, metric_data in result.get('metrics', {}).items():
                score = metric_data.get('score', 0)
                passed = metric_data.get('passed', False)
                status = "✅ PASS" if passed else "❌ FAIL"
                summary += f"- **{metric_name}**: {score:.4f} {status}\n"

            summary += f"\n**Threshold:** {results.get('threshold', 0.8)}\n"
            summary += f"**Model:** {results.get('model_name', 'N/A')}\n"

        return summary

    def _format_dataset_results(self, results: Dict[str, Any]) -> str:
        """Format dataset evaluation results."""
        summary = "## 📊 Dataset Evaluation Results\n\n"

        summary += f"**Total Cases:** {results.get('total_cases', 0)}\n"
        summary += f"**Passed Cases:** {results.get('passed_cases', 0)}\n"
        summary += f"**Failed Cases:** {results.get('failed_cases', 0)}\n"
        summary += f"**Pass Rate:** {results.get('pass_rate', 0):.2f}%\n"
        summary += f"**Average Score:** {results.get('average_score', 0):.4f}\n"
        summary += f"**Threshold:** {results.get('threshold', 0.8)}\n"
        summary += f"**Model:** {results.get('model_name', 'N/A')}\n\n"

        # Add some sample results
        if results.get('detailed_results'):
            summary += "### Sample Results:\n"
            for i, result in enumerate(results['detailed_results'][:3]):
                summary += f"\n**Case {i+1}:**\n"
                summary += f"- Input: {result.get('input', 'N/A')[:100]}...\n"

                for metric_name, metric_data in result.get('metrics', {}).items():
                    score = metric_data.get('score', 0)
                    passed = metric_data.get('passed', False)
                    status = "✅" if passed else "❌"
                    summary += f"- {metric_name}: {score:.3f} {status}\n"

        return summary

    def create_interface(self):
        """Create the Gradio interface."""
        with gr.Blocks(title="Korean Q&A Evaluation System", theme=gr.themes.Soft()) as demo:
            gr.Markdown("""
            # 🇰🇷 Korean Q&A Evaluation System

            Evaluate Korean language question-answering systems using Google's Gemini model.
            """)

            with gr.Tabs():
                # Single Evaluation Tab
                with gr.TabItem("Single Evaluation"):
                    gr.Markdown("### Evaluate a single question-answer pair")

                    with gr.Row():
                        with gr.Column():
                            single_input = gr.Textbox(
                                label="Input Question (Korean)",
                                placeholder="이번 달 우리 회사 전체 매출은 얼마야?",
                                lines=3
                            )
                            single_output = gr.Textbox(
                                label="Actual Output (Korean)",
                                placeholder="2025년 1월 삼광 Global 전체 매출은 335.4억원입니다...",
                                lines=5
                            )
                            single_api_key = gr.Textbox(
                                label="Google API Key (optional if configured)",
                                type="password",
                                placeholder="Enter your Google API key"
                            )
                            single_eval_btn = gr.Button("🔍 Evaluate", variant="primary")

                        with gr.Column():
                            single_results = gr.Markdown(label="Results")

                    with gr.Row():
                        single_score_plot = gr.Plot(label="Score Distribution")
                        single_pie_plot = gr.Plot(label="Pass/Fail")

                    single_eval_btn.click(
                        fn=self.evaluate_single_question,
                        inputs=[single_input, single_output, single_api_key],
                        outputs=[single_results, single_score_plot, single_pie_plot]
                    )

                # Dataset Evaluation Tab
                with gr.TabItem("Dataset Evaluation"):
                    gr.Markdown("### Evaluate an entire dataset from CSV file")

                    with gr.Row():
                        with gr.Column():
                            dataset_file = gr.File(
                                label="Upload Dataset CSV",
                                file_types=[".csv"],
                                type="filepath"
                            )
                            dataset_api_key = gr.Textbox(
                                label="Google API Key (optional if configured)",
                                type="password",
                                placeholder="Enter your Google API key"
                            )
                            dataset_threshold = gr.Slider(
                                label="Evaluation Threshold",
                                minimum=0.0,
                                maximum=1.0,
                                value=0.8,
                                step=0.1
                            )
                            dataset_eval_btn = gr.Button("📊 Evaluate Dataset", variant="primary")

                        with gr.Column():
                            dataset_results = gr.Markdown(label="Results Summary")
                            download_btn = gr.File(label="Download Results JSON")

                    with gr.Row():
                        dataset_score_plot = gr.Plot(label="Score Distribution")
                        dataset_pie_plot = gr.Plot(label="Pass/Fail Distribution")

                    with gr.Row():
                        metrics_comparison_plot = gr.Plot(label="Metrics Comparison")
                        scatter_plot = gr.Plot(label="Score vs Length Analysis")

                    dataset_eval_btn.click(
                        fn=self.evaluate_dataset,
                        inputs=[dataset_file, dataset_api_key, dataset_threshold],
                        outputs=[dataset_results, dataset_score_plot, dataset_pie_plot,
                                 metrics_comparison_plot, scatter_plot]
                    )

                    # Download functionality
                    download_results_btn = gr.Button("📥 Prepare Download")
                    download_results_btn.click(
                        fn=self.download_results,
                        outputs=download_btn
                    )

                # About Tab
                with gr.TabItem("About"):
                    gr.Markdown("""
                    ## About Korean Q&A Evaluation System

                    This system evaluates Korean language question-answering models using:

                    - **DeepEval Framework**: Advanced evaluation metrics
                    - **Google Gemini Model**: State-of-the-art language model for evaluation
                    - **Answer Relevancy Metric**: Measures how well answers address questions

                    ### Features:
                    - ✅ Single question evaluation
                    - ✅ Batch dataset evaluation
                    - ✅ Interactive visualizations
                    - ✅ Detailed metrics analysis
                    - ✅ Results export

                    ### Supported Metrics:
                    - **Answer Relevancy**: How relevant is the answer to the question?
                    - **Contextual Precision**: How precise is the answer given the context?

                    ### CSV Format:
                    Your dataset should have columns: `input`, `expected_output`

                    ```csv
                    input,expected_output
                    "이번 달 매출은?","2025년 1월 매출은 335억원입니다."
                    ```
                    """)

        return demo

def main():
    """Launch the Gradio demo."""
    demo_app = GradioDemo()
    demo = demo_app.create_interface()

    # Launch with public link for sharing
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,  # Creates public link
        show_error=True,
        debug=True
    )

if __name__ == "__main__":
    main()
src/evaluation_bench/launch_gradio.py ADDED
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Launch script for Gradio demo."""

import subprocess
import sys
import os

def main():
    """Launch the Gradio demo."""
    try:
        # Set environment variables for better performance
        os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

        print("🚀 Launching Korean Q&A Evaluation System (Gradio)")
        print("=" * 60)
        print("📱 The demo will be available at:")
        print("   - Local: http://localhost:7860")
        print("   - Public: A shareable link will be generated")
        print("=" * 60)

        # Run the Gradio demo
        subprocess.run([sys.executable, "gradio_demo.py"], check=True)

    except KeyboardInterrupt:
        print("\n👋 Demo stopped by user")
    except subprocess.CalledProcessError as e:
        print(f"❌ Demo failed with exit code {e.returncode}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
src/evaluation_bench/launch_streamlit.py ADDED
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Launch script for Streamlit demo."""

import subprocess
import sys

def main():
    """Launch the Streamlit demo."""
    try:
        print("🚀 Launching Korean Q&A Evaluation System (Streamlit)")
        print("=" * 60)
        print("📱 The demo will be available at:")
        print("   - Local: http://localhost:8501")
        print("=" * 60)

        # Run the Streamlit demo
        subprocess.run([
            sys.executable, "-m", "streamlit", "run", "streamlit_demo.py",
            "--server.port", "8501",
            "--server.address", "0.0.0.0",
            "--browser.gatherUsageStats", "false"
        ], check=True)

    except KeyboardInterrupt:
        print("\n👋 Demo stopped by user")
    except subprocess.CalledProcessError as e:
        print(f"❌ Demo failed with exit code {e.returncode}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
src/evaluation_bench/logger.py ADDED
@@ -0,0 +1,110 @@
"""Logging configuration for the Korean Q&A evaluation system."""

import logging
import logging.handlers
import sys
from pathlib import Path
from datetime import datetime

class LoggerSetup:
    """Setup and configure logging for the application."""

    def __init__(self,
                 log_dir: str = "logs",
                 log_level: str = "INFO",
                 max_bytes: int = 10 * 1024 * 1024,  # 10MB
                 backup_count: int = 5):
        self.log_dir = Path(log_dir)
        self.log_level = getattr(logging, log_level.upper())
        self.max_bytes = max_bytes
        self.backup_count = backup_count

        # Create logs directory if it doesn't exist
        self.log_dir.mkdir(exist_ok=True)

        # Setup logging
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure logging with file and console handlers."""
        # Create formatters
        detailed_formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        simple_formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )

        # Get root logger
        root_logger = logging.getLogger()
        root_logger.setLevel(self.log_level)

        # Clear existing handlers
        root_logger.handlers.clear()

        # File handler for all logs
        all_logs_file = self.log_dir / f"evaluation_{datetime.now().strftime('%Y%m%d')}.log"
        file_handler = logging.handlers.RotatingFileHandler(
            all_logs_file,
            maxBytes=self.max_bytes,
            backupCount=self.backup_count,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(file_handler)

        # Error file handler
        error_logs_file = self.log_dir / f"errors_{datetime.now().strftime('%Y%m%d')}.log"
        error_handler = logging.handlers.RotatingFileHandler(
            error_logs_file,
            maxBytes=self.max_bytes,
            backupCount=self.backup_count,
            encoding='utf-8'
        )
        error_handler.setLevel(logging.ERROR)
        error_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(error_handler)

        # Console handler
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(self.log_level)
        console_handler.setFormatter(simple_formatter)
        root_logger.addHandler(console_handler)

        # Log the setup completion
        logging.info(f"Logging initialized - Level: {logging.getLevelName(self.log_level)}")
        logging.info(f"Log files location: {self.log_dir.absolute()}")

    def get_logger(self, name: str) -> logging.Logger:
        """Get a logger instance for a specific module."""
        return logging.getLogger(name)

    def log_evaluation_start(self, dataset_path: str, model_name: str) -> None:
        """Log evaluation session start."""
        logger = logging.getLogger("evaluation")
        logger.info("=" * 80)
        logger.info("EVALUATION SESSION STARTED")
        logger.info(f"Dataset: {dataset_path}")
        logger.info(f"Model: {model_name}")
        logger.info(f"Timestamp: {datetime.now().isoformat()}")
        logger.info("=" * 80)

    def log_evaluation_end(self, results: dict) -> None:
        """Log evaluation session end with results."""
        logger = logging.getLogger("evaluation")
        logger.info("=" * 80)
        logger.info("EVALUATION SESSION COMPLETED")
        logger.info(f"Total test cases: {results.get('total_cases', 'N/A')}")
        logger.info(f"Pass rate: {results.get('pass_rate', 'N/A')}%")
        logger.info(f"Average score: {results.get('average_score', 'N/A')}")
        logger.info(f"Timestamp: {datetime.now().isoformat()}")
        logger.info("=" * 80)

def setup_logging(log_level: str = "INFO") -> LoggerSetup:
    """Setup logging and return logger setup instance."""
    return LoggerSetup(log_level=log_level)
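
# Usage sketch (hedged example; the import path matches how other modules in
# this repo import the logger):
#
#   logger_setup = setup_logging(log_level="DEBUG")
#   logger = logger_setup.get_logger(__name__)
#   logger.info("Goes to the console and to logs/evaluation_<YYYYMMDD>.log")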
src/evaluation_bench/main.py ADDED
@@ -0,0 +1,111 @@
"""Main entry point for the Korean Q&A evaluation system."""

import argparse
import sys
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent / "src"))

from src.logger import setup_logging
from src.config import Config
from src.dataset_loader import DatasetLoader
from src.evaluator import KoreanQAEvaluator

def main():
    """Main function to run the evaluation."""
    parser = argparse.ArgumentParser(description="Korean Q&A Evaluation System")
    parser.add_argument("--config", default=None,
                        help="Path to config YAML file (default: auto-detect src/config.yaml)")
    parser.add_argument("--dataset", default="assets/bench_korean.csv",
                        help="Path to dataset CSV file")
    parser.add_argument("--output", help="Output path for results (optional)")
    parser.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level")
    parser.add_argument("--threshold", type=float, default=0.8, help="Evaluation threshold")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose evaluation mode")

    args = parser.parse_args()

    # Setup logging
    logger_setup = setup_logging(log_level=args.log_level)
    logger = logger_setup.get_logger(__name__)

    try:
        logger.info("Starting Korean Q&A Evaluation System")

        # Load configuration
        logger.info("Loading configuration...")
        if args.config is None:
            # Try to find config file in multiple locations
            script_dir = Path(__file__).parent
            possible_configs = [script_dir / "src" / "config.yaml",
                                script_dir / "config.yaml"]

            config_path = None
            for path in possible_configs:
                if path.exists():
                    config_path = str(path)
                    break

            if config_path is None:
                raise FileNotFoundError("No config.yaml found in expected locations")
        else:
            config_path = args.config
        config = Config(config_path)

        # Log evaluation start
        logger_setup.log_evaluation_start(args.dataset, config.gemini_model)

        # Load dataset
        logger.info("Loading dataset...")
        dataset_loader = DatasetLoader()
        dataset = dataset_loader.load_from_csv(args.dataset)

        # Get dataset statistics
        stats = dataset_loader.get_dataset_stats()
        logger.info(f"Dataset loaded: {stats}")

        # Initialize evaluator
        logger.info("Initializing evaluator...")
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=args.threshold,
            verbose_mode=args.verbose)

        # Run evaluation
        logger.info("Running evaluation...")
        results = evaluator.evaluate_dataset(dataset)

        # Save results
        if args.output:
            output_path = evaluator.save_results(results, args.output)
        else:
            output_path = evaluator.save_results(results)

        # Log evaluation end
        logger_setup.log_evaluation_end(results)
        logger.info(f"Evaluation completed successfully. Results saved to: {output_path}")

        # Print summary to console
        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total test cases: {results['total_cases']}")
        print(f"Passed cases: {results['passed_cases']}")
        print(f"Failed cases: {results['failed_cases']}")
        print(f"Pass rate: {results['pass_rate']}%")
        print(f"Average score: {results['average_score']}")
        print(f"Threshold: {results['threshold']}")
        print(f"Model: {results['model_name']}")
        print(f"Results saved to: {output_path}")
        print("=" * 60)

    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
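
# CLI sketch (paths are illustrative; config auto-detection and the dataset
# default apply when flags are omitted):
#
#   python main.py --dataset assets/bench_korean.csv --threshold 0.8 --verbose
#   python main.py --config src/config.yaml --output results/run1.json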
src/evaluation_bench/run_evaluation.py ADDED
@@ -0,0 +1,37 @@
import subprocess
import sys

def run_evaluation():
    """Run the evaluation with default settings."""
    try:
        # Run the main evaluation script
        cmd = [
            sys.executable,
            "main.py",
            "--dataset", "assets/bench_korean.csv",
            "--log-level", "INFO",
            "--verbose"
        ]

        print("Starting Korean Q&A Evaluation...")
        print(f"Command: {' '.join(cmd)}")
        print("-" * 60)

        subprocess.run(cmd, check=True)

        print("-" * 60)
        print("Evaluation completed successfully!")

    except subprocess.CalledProcessError as e:
        print(f"Evaluation failed with exit code {e.returncode}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nEvaluation interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    run_evaluation()
src/evaluation_bench/streamlit_demo.py ADDED
@@ -0,0 +1,424 @@
"""Streamlit demo interface for the Korean Q&A evaluation system."""

import streamlit as st
import json
import sys
from pathlib import Path
import pandas as pd
from typing import Dict, Any

# Add src to path
sys.path.append(str(Path(__file__).parent / "src"))

from src.config import Config
from src.dataset_loader import DatasetLoader
from src.evaluator import KoreanQAEvaluator
from src.visualization import EvaluationVisualizer

# Page config
st.set_page_config(
    page_title="Korean Q&A Evaluation System",
    page_icon="🇰🇷",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 4px solid #1f77b4;
    }
    .success-metric {
        border-left-color: #28a745;
    }
    .warning-metric {
        border-left-color: #ffc107;
    }
    .error-metric {
        border-left-color: #dc3545;
    }
</style>
""", unsafe_allow_html=True)

class StreamlitDemo:
    """Streamlit demo interface for Korean Q&A evaluation."""

    def __init__(self):
        self.visualizer = EvaluationVisualizer()

        # Initialize session state
        if 'config' not in st.session_state:
            st.session_state.config = self._load_config()
        if 'evaluator' not in st.session_state:
            st.session_state.evaluator = None
        if 'current_results' not in st.session_state:
            st.session_state.current_results = None

    def _load_config(self):
        """Load configuration."""
        try:
            script_dir = Path(__file__).parent
            config_path = script_dir / "src" / "config.yaml"
            if config_path.exists():
                return Config(str(config_path))
        except Exception as e:
            st.error(f"Failed to load configuration: {e}")
        return None

    def _initialize_evaluator(self, api_key: str, threshold: float = 0.8):
        """Initialize the evaluator."""
        try:
            if not api_key and st.session_state.config:
                api_key = st.session_state.config.google_api_key

            if not api_key:
                st.error("Please provide Google API key")
                return False

            model_name = st.session_state.config.gemini_model if st.session_state.config else "gemini-2.0-flash"
            st.session_state.evaluator = KoreanQAEvaluator(
                model_name=model_name,
                api_key=api_key,
                threshold=threshold,
                verbose_mode=True
            )
            return True

        except Exception as e:
            st.error(f"Failed to initialize evaluator: {e}")
            return False

    def render_header(self):
        """Render the main header."""
        st.markdown('<h1 class="main-header">🇰🇷 Korean Q&A Evaluation System</h1>', unsafe_allow_html=True)
        st.markdown("---")

        # Info section
        with st.expander("ℹ️ About this system"):
            st.markdown("""
            This system evaluates Korean language question-answering models using:

            - **DeepEval Framework**: Advanced evaluation metrics
            - **Google Gemini Model**: State-of-the-art language model for evaluation
            - **Interactive Visualizations**: Real-time charts and analysis

            **Supported Metrics:**
            - Answer Relevancy: How relevant is the answer to the question?
            - Contextual Precision: How precise is the answer given the context?
            """)

    def render_sidebar(self):
        """Render the sidebar with configuration."""
        st.sidebar.header("⚙️ Configuration")

        # API Key input
        api_key = st.sidebar.text_input(
            "Google API Key",
            type="password",
            help="Enter your Google API key for Gemini model access"
        )

        # Threshold slider
        threshold = st.sidebar.slider(
            "Evaluation Threshold",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
            step=0.1,
            help="Minimum score required to pass evaluation"
        )

        # Model info
        if st.session_state.config:
            st.sidebar.info(f"Model: {st.session_state.config.gemini_model}")

        return api_key, threshold

    def render_single_evaluation(self, api_key: str, threshold: float):
        """Render single evaluation interface."""
        st.header("🔍 Single Question Evaluation")

        col1, col2 = st.columns([1, 1])

        with col1:
            input_text = st.text_area(
                "Input Question (Korean)",
                placeholder="이번 달 우리 회사 전체 매출은 얼마야?",
                height=100
            )

            actual_output = st.text_area(
                "Actual Output (Korean)",
                placeholder="2025년 1월 삼광 Global 전체 매출은 335.4억원입니다...",
                height=150
            )

            if st.button("🔍 Evaluate Single Question", type="primary"):
                if not input_text.strip() or not actual_output.strip():
                    st.error("Please provide both input and output text")
                    return

                if not self._initialize_evaluator(api_key, threshold):
                    return

                with st.spinner("Evaluating..."):
                    try:
                        results = st.session_state.evaluator.evaluate_single_case(
                            input_text=input_text,
                            actual_output=actual_output
                        )

                        # Display results
                        self._display_single_results(results)

                    except Exception as e:
                        st.error(f"Evaluation failed: {e}")

        with col2:
            st.info("💡 **Tips:**\n\n- Enter Korean text for best results\n- Longer, more detailed answers typically score higher\n- The system evaluates relevance, not correctness")

    def render_dataset_evaluation(self, api_key: str, threshold: float):
        """Render dataset evaluation interface."""
        st.header("📊 Dataset Evaluation")

        # File upload
        uploaded_file = st.file_uploader(
            "Upload Dataset CSV",
            type=['csv'],
            help="CSV file should have 'input' and 'expected_output' columns"
        )

        if uploaded_file is not None:
            # Show preview
            try:
                df = pd.read_csv(uploaded_file)
                st.subheader("📋 Dataset Preview")
                st.dataframe(df.head(), use_container_width=True)

                col1, col2, col3 = st.columns([1, 1, 2])
                with col1:
                    st.metric("Total Rows", len(df))
                with col2:
                    st.metric("Columns", len(df.columns))
                with col3:
                    st.write("**Columns:**", ", ".join(df.columns.tolist()))

            except Exception as e:
                st.error(f"Error reading CSV file: {e}")
                return

            # Evaluation button
            if st.button("📊 Evaluate Dataset", type="primary"):
                if not self._initialize_evaluator(api_key, threshold):
                    return

                with st.spinner("Evaluating dataset... This may take a while."):
                    try:
                        # Save uploaded file temporarily
                        temp_path = "temp_dataset.csv"
                        df.to_csv(temp_path, index=False)

                        # Load and evaluate
                        dataset_loader = DatasetLoader()
                        dataset = dataset_loader.load_from_csv(temp_path)

                        results = st.session_state.evaluator.evaluate_dataset(dataset)
                        st.session_state.current_results = results

                        # Display results
                        self._display_dataset_results(results)

                        # Clean up
                        Path(temp_path).unlink(missing_ok=True)

                    except Exception as e:
                        st.error(f"Dataset evaluation failed: {e}")

    def _display_single_results(self, results: Dict[str, Any]):
        """Display single evaluation results."""
        st.subheader("📈 Evaluation Results")

        if results.get('detailed_results'):
            result = results['detailed_results'][0]

            # Metrics display
            metrics = result.get('metrics', {})
            if metrics:
                cols = st.columns(len(metrics))
                for i, (metric_name, metric_data) in enumerate(metrics.items()):
                    with cols[i]:
                        score = metric_data.get('score', 0)
                        passed = metric_data.get('passed', False)

                        # Color based on pass/fail
                        if passed:
                            st.markdown('<div class="metric-card success-metric">', unsafe_allow_html=True)
                            st.metric(metric_name, f"{score:.4f}", "✅ PASS")
                        else:
                            st.markdown('<div class="metric-card error-metric">', unsafe_allow_html=True)
                            st.metric(metric_name, f"{score:.4f}", "❌ FAIL")
                        st.markdown('</div>', unsafe_allow_html=True)

            # Visualizations
            col1, col2 = st.columns(2)
            with col1:
                fig = self.visualizer.create_score_histogram(results)
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                fig = self.visualizer.create_pass_fail_pie_chart(results)
                st.plotly_chart(fig, use_container_width=True)

    def _display_dataset_results(self, results: Dict[str, Any]):
        """Display dataset evaluation results."""
        st.subheader("📊 Dataset Evaluation Results")

        # Summary metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.markdown('<div class="metric-card">', unsafe_allow_html=True)
            st.metric("Total Cases", results.get('total_cases', 0))
            st.markdown('</div>', unsafe_allow_html=True)

        with col2:
            passed = results.get('passed_cases', 0)
            st.markdown('<div class="metric-card success-metric">', unsafe_allow_html=True)
            st.metric("Passed", passed)
            st.markdown('</div>', unsafe_allow_html=True)

        with col3:
            failed = results.get('failed_cases', 0)
            st.markdown('<div class="metric-card error-metric">', unsafe_allow_html=True)
            st.metric("Failed", failed)
            st.markdown('</div>', unsafe_allow_html=True)

        with col4:
            pass_rate = results.get('pass_rate', 0)
            color_class = "success-metric" if pass_rate >= 80 else "warning-metric" if pass_rate >= 60 else "error-metric"
            st.markdown(f'<div class="metric-card {color_class}">', unsafe_allow_html=True)
            st.metric("Pass Rate", f"{pass_rate:.1f}%")
            st.markdown('</div>', unsafe_allow_html=True)

        # Additional metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Average Score", f"{results.get('average_score', 0):.4f}")
        with col2:
            st.metric("Threshold", results.get('threshold', 0.8))
        with col3:
            st.metric("Model", results.get('model_name', 'N/A'))

        # Visualizations
        st.subheader("📈 Detailed Analysis")

        # First row of charts
        col1, col2 = st.columns(2)
        with col1:
            fig = self.visualizer.create_score_histogram(results)
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            fig = self.visualizer.create_pass_fail_pie_chart(results)
            st.plotly_chart(fig, use_container_width=True)

        # Second row of charts
        fig = self.visualizer.create_metrics_comparison(results)
        st.plotly_chart(fig, use_container_width=True)

        # Third row
        fig = self.visualizer.create_score_vs_length_scatter(results)
        st.plotly_chart(fig, use_container_width=True)

        # Summary table
        fig = self.visualizer.create_summary_stats_table(results)
        st.plotly_chart(fig, use_container_width=True)

        # Download results
        if st.button("📥 Download Results JSON"):
            json_str = json.dumps(results, ensure_ascii=False, indent=2)
            st.download_button(
                label="Download JSON",
                data=json_str,
                file_name="evaluation_results.json",
                mime="application/json"
            )

    def render_sample_data_tab(self):
        """Render sample data information."""
        st.header("📋 Sample Data Format")

        st.markdown("""
        ### CSV Format Requirements

        Your dataset CSV file should have the following columns:
        - `input`: The question or input text (Korean)
        - `expected_output`: The expected answer or output text (Korean)
        """)

        # Sample data
        sample_data = {
            'input': [
                '이번 달 우리 회사 전체 매출은 얼마야?',
                '사업부별 매출 비중이 어떻게 되나요?',
                '최근 수율이 낮은 공정이 있나요?'
            ],
            'expected_output': [
                '2025년 1월 삼광 Global 전체 매출은 335.4억원입니다.',
                '한국 사업부: 213.0억원 (39.7%), 베트남 사업부: 38.6억원 (44.1%)',
                'R47 ENCLOSURE 사출: 59%, R47 ARM 사출: 80% 등이 90% 미만입니다.'
            ]
        }

        sample_df = pd.DataFrame(sample_data)
        st.subheader("Sample Data")
        st.dataframe(sample_df, use_container_width=True)

        # Download sample
        csv = sample_df.to_csv(index=False)
        st.download_button(
            label="📥 Download Sample CSV",
            data=csv,
            file_name="sample_korean_qa.csv",
            mime="text/csv"
        )

    def run(self):
        """Run the Streamlit app."""
        self.render_header()

        # Sidebar
        api_key, threshold = self.render_sidebar()

        # Main tabs
        tab1, tab2, tab3 = st.tabs(["🔍 Single Evaluation", "📊 Dataset Evaluation", "📋 Sample Data"])

        with tab1:
            self.render_single_evaluation(api_key, threshold)

        with tab2:
            self.render_dataset_evaluation(api_key, threshold)

        with tab3:
            self.render_sample_data_tab()

def main():
    """Main function to run the Streamlit app."""
    demo = StreamlitDemo()
    demo.run()

if __name__ == "__main__":
    main()
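
# Launch sketch: Streamlit apps start via the CLI rather than `python streamlit_demo.py`
# (launch_streamlit.py wraps exactly this command):
#
#   streamlit run streamlit_demo.py --server.port 8501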
src/evaluation_bench/visualization.py ADDED
@@ -0,0 +1,347 @@
1
+ """Visualization utilities for the Korean Q&A evaluation system."""
2
+
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import plotly.graph_objects as go
6
+ import plotly.express as px
7
+ from plotly.subplots import make_subplots
8
+ import pandas as pd
9
+ import numpy as np
10
+ from typing import Dict, List, Any
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ class EvaluationVisualizer:
16
+ """Create visualizations for evaluation results."""
17
+
18
+ def __init__(self):
19
+ # Set style for matplotlib
20
+ plt.style.use('seaborn-v0_8')
21
+ sns.set_palette("husl")
22
+
23
+ def create_score_histogram(self, results: Dict[str, Any], metric_name: str = "Answer Relevancy") -> go.Figure:
24
+ """
25
+ Create histogram of evaluation scores.
26
+
27
+ Args:
28
+ results: Evaluation results dictionary
29
+ metric_name: Name of the metric to visualize
30
+
31
+ Returns:
32
+ Plotly figure object
33
+ """
34
+ try:
35
+ # Extract scores from detailed results
36
+ scores = []
37
+ for result in results.get('detailed_results', []):
38
+ metrics = result.get('metrics', {})
39
+ for metric, data in metrics.items():
40
+ # Handle both display names and class names
41
+ if (metric_name.lower() in metric.lower() or
42
+ metric_name.replace(" ", "").lower() in metric.lower() or
43
+ "answerrelevancy" in metric.lower()):
44
+ scores.append(data.get('score', 0))
45
+
46
+ if not scores:
47
+ logger.warning(f"No scores found for metric: {metric_name}")
48
+ return self._create_empty_figure("No data available")
49
+
50
+ # Create histogram
51
+ fig = go.Figure()
52
+
53
+ fig.add_trace(go.Histogram(
54
+ x=scores,
55
+ nbinsx=20,
56
+ name=metric_name,
57
+ marker_color='skyblue',
58
+ opacity=0.7,
59
+ hovertemplate='Score Range: %{x}<br>Count: %{y}<extra></extra>'
60
+ ))
61
+
62
+ # Add threshold line
63
+ threshold = results.get('threshold', 0.8)
64
+ fig.add_vline(
65
+ x=threshold,
66
+ line_dash="dash",
67
+ line_color="red",
68
+ annotation_text=f"Threshold: {threshold}",
69
+ annotation_position="top right"
70
+ )
71
+
72
+ # Update layout
73
+ fig.update_layout(
74
+ title=f'{metric_name} Score Distribution',
75
+ xaxis_title='Score',
76
+ yaxis_title='Frequency',
77
+ showlegend=False,
78
+ template='plotly_white',
79
+ height=400)
80
+
81
+ return fig
82
+
83
+ except Exception as e:
84
+ logger.error(f"Error creating score histogram: {e}")
85
+ return self._create_empty_figure("Error creating histogram")
86
+
87
+ def create_pass_fail_pie_chart(self, results: Dict[str, Any]) -> go.Figure:
88
+ """Create pie chart showing pass/fail distribution."""
89
+ try:
90
+ passed = results.get('passed_cases', 0)
91
+ failed = results.get('failed_cases', 0)
92
+
93
+ if passed == 0 and failed == 0:
94
+ return self._create_empty_figure("No evaluation data available")
95
+
96
+ fig = go.Figure(data=[go.Pie(
97
+ labels=['Passed', 'Failed'],
98
+ values=[passed, failed],
99
+ hole=0.3,
100
+ marker_colors=['#2E8B57', '#DC143C'],
101
+ hovertemplate='%{label}: %{value} cases<br>%{percent}<extra></extra>'
102
+ )])
103
+
104
+ fig.update_layout(
105
+ title='Pass/Fail Distribution',
106
+ template='plotly_white',
107
+ height=400
108
+ )
109
+
110
+ return fig
111
+
112
+ except Exception as e:
113
+ logger.error(f"Error creating pie chart: {e}")
114
+ return self._create_empty_figure("Error creating pie chart")
115
+
116
+ def create_metrics_comparison(self, results: Dict[str, Any]) -> go.Figure:
117
+ """Create comparison chart for different metrics."""
118
+ try:
119
+ # Extract metrics data
120
+ metrics_data = {}
121
+
122
+ for result in results.get('detailed_results', []):
123
+ metrics = result.get('metrics', {})
124
+ for metric_name, data in metrics.items():
125
+ if metric_name not in metrics_data:
126
+ metrics_data[metric_name] = []
127
+ metrics_data[metric_name].append(data.get('score', 0))
128
+
129
+ if not metrics_data:
130
+ return self._create_empty_figure("No metrics data available")
131
+
132
+ # Create subplots
133
+ fig = make_subplots(
134
+ rows=len(metrics_data),
135
+ cols=1,
136
+ subplot_titles=list(metrics_data.keys()),
137
+ vertical_spacing=0.1
138
+ )
139
+
140
+ colors = px.colors.qualitative.Set3
141
+
142
+ for i, (metric_name, scores) in enumerate(metrics_data.items()):
143
+ fig.add_trace(
144
+ go.Histogram(
145
+ x=scores,
146
+ name=metric_name,
147
+ marker_color=colors[i % len(colors)],
148
+ opacity=0.7,
149
+ nbinsx=15
150
+ ),
151
+ row=i+1, col=1
152
+ )
153
+
154
+ fig.update_layout(
155
+ title='Metrics Comparison',
156
+ template='plotly_white',
157
+ height=300 * len(metrics_data),
158
+ showlegend=False
159
+ )
160
+
161
+ return fig
162
+
163
+ except Exception as e:
164
+ logger.error(f"Error creating metrics comparison: {e}")
165
+ return self._create_empty_figure("Error creating metrics comparison")
166
+
167
+
168
+
169
+
170
+ def create_score_vs_length_scatter(self, results: Dict[str, Any]) -> go.Figure:
171
+ try:
172
+ scores = []
173
+ input_lengths = []
174
+ output_lengths = []
175
+ for result in results.get('detailed_results', []):
176
+ input_text = result.get('input', '')
177
+ output_text = result.get('actual_output', '')
178
+ input_lengths.append(len(input_text))
179
+ output_lengths.append(len(output_text))
180
+ # Get the first available score
181
+ metrics = result.get('metrics', {})
182
+ score = 0
183
+ for metric_data in metrics.values():
184
+ score = metric_data.get('score', 0)
185
+ break
186
+ scores.append(score)
187
+ if not scores:
188
+ return self._create_empty_figure("No data available for scatter plot")
189
+ fig = make_subplots(
190
+ rows=2, cols=1,
191
+ subplot_titles=['Score vs Input Length', 'Score vs Output Length']
192
+ )
193
+
194
+ # Input length scatter
195
+ fig.add_trace(
196
+ go.Scatter(
197
+ x=input_lengths,
198
+ y=scores,
199
+ mode='markers',
200
+ name='Input Length',
201
+ marker=dict(color='blue', opacity=0.6),
202
+ hovertemplate='Input Length: %{x}<br>Score: %{y:.3f}<extra></extra>'
203
+ ),
204
+ row=1, col=1
205
+ )
206
+
207
+ # Output length scatter
208
+ fig.add_trace(
209
+ go.Scatter(
210
+ x=output_lengths,
211
+ y=scores,
212
+ mode='markers',
213
+ name='Output Length',
214
+ marker=dict(color='red', opacity=0.6),
215
+ hovertemplate='Output Length: %{x}<br>Score: %{y:.3f}<extra></extra>'
216
+ ),
217
+ row=2, col=1
218
+ )
219
+
220
+ fig.update_layout(
221
+ title='Score vs Text Length Analysis',
222
+ template='plotly_white',
223
+ height=400,
224
+ showlegend=False
225
+ )
226
+
227
+ fig.update_xaxes(title_text="Input Length (characters)", row=1, col=1)
228
+ fig.update_xaxes(title_text="Output Length (characters)", row=1, col=2)
229
+ fig.update_yaxes(title_text="Score", row=1, col=1)
230
+ fig.update_yaxes(title_text="Score", row=1, col=2)
231
+
232
+ return fig
233
+
234
+ except Exception as e:
235
+ logger.error(f"Error creating scatter plot: {e}")
236
+ return self._create_empty_figure("Error creating scatter plot")
237
+
238
+ def create_summary_stats_table(self, results: Dict[str, Any]) -> go.Figure:
239
+ """Create summary statistics table."""
240
+ try:
241
+ stats = [
242
+ ['Total Cases', results.get('total_cases', 0)],
243
+ ['Passed Cases', results.get('passed_cases', 0)],
244
+ ['Failed Cases', results.get('failed_cases', 0)],
245
+ ['Pass Rate', f"{results.get('pass_rate', 0):.2f}%"],
246
+ ['Average Score', f"{results.get('average_score', 0):.4f}"],
247
+ ['Threshold', results.get('threshold', 0.8)],
248
+ ['Model', results.get('model_name', 'N/A')],
249
+ ['Evaluation Time', results.get('evaluation_timestamp', 'N/A')]
250
+ ]
251
+
252
+ fig = go.Figure(data=[go.Table(
253
+ header=dict(
254
+ values=['Metric', 'Value'],
255
+ fill_color='lightblue',
256
+ align='left',
257
+ font=dict(size=14, color='black')
258
+ ),
259
+ cells=dict(
260
+ values=list(zip(*stats)),
261
+ fill_color='white',
262
+ align='left',
263
+ font=dict(size=12)
264
+ )
265
+ )])
266
+
267
+ fig.update_layout(
268
+ title='Evaluation Summary',
269
+ height=400,
270
+ margin=dict(l=0, r=0, t=30, b=0)
271
+ )
272
+
273
+ return fig
274
+
275
+ except Exception as e:
276
+ logger.error(f"Error creating summary table: {e}")
277
+ return self._create_empty_figure("Error creating summary table")
278
+
279
+ def _create_empty_figure(self, message: str) -> go.Figure:
280
+ """Create an empty figure with a message."""
281
+ fig = go.Figure()
282
+ fig.add_annotation(
283
+ text=message,
284
+ xref="paper", yref="paper",
285
+ x=0.5, y=0.5,
286
+ showarrow=False,
287
+ font=dict(size=16)
288
+ )
289
+ fig.update_layout(
290
+ template='plotly_white',
291
+ height=400,
292
+ xaxis=dict(visible=False),
293
+ yaxis=dict(visible=False)
294
+ )
295
+ return fig
296
+
297
    def save_matplotlib_plots(self, results: Dict[str, Any], output_dir: str = "plots") -> List[str]:
        """Save matplotlib plots to files."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        saved_files = []

        try:
            # Extract scores (only the first metric of each result is sampled)
            scores = []
            for result in results.get('detailed_results', []):
                metrics = result.get('metrics', {})
                for metric_data in metrics.values():
                    scores.append(metric_data.get('score', 0))
                    break

            if scores:
                # Score histogram
                plt.figure(figsize=(10, 6))
                plt.hist(scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                plt.axvline(results.get('threshold', 0.8), color='red', linestyle='--',
                            label=f"Threshold: {results.get('threshold', 0.8)}")
                plt.xlabel('Score')
                plt.ylabel('Frequency')
                plt.title('Score Distribution')
                plt.legend()
                plt.grid(True, alpha=0.3)

                hist_file = os.path.join(output_dir, 'score_histogram.png')
                plt.savefig(hist_file, dpi=300, bbox_inches='tight')
                plt.close()
                saved_files.append(hist_file)

                # Box plot
                plt.figure(figsize=(8, 6))
                plt.boxplot(scores, labels=['Scores'])
                plt.ylabel('Score')
                plt.title('Score Distribution (Box Plot)')
                plt.grid(True, alpha=0.3)

                box_file = os.path.join(output_dir, 'score_boxplot.png')
                plt.savefig(box_file, dpi=300, bbox_inches='tight')
                plt.close()
                saved_files.append(box_file)

            logger.info(f"Saved {len(saved_files)} matplotlib plots to {output_dir}")
            return saved_files

        except Exception as e:
            logger.error(f"Error saving matplotlib plots: {e}")
            return []
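A minimal usage sketch for the plotting helpers above (the enclosing visualizer class is defined earlier in this file; the class name `EvaluationVisualizer` and the sample `results` values are assumptions for illustration):

from src.evaluation_bench.visualization import EvaluationVisualizer  # class name assumed

viz = EvaluationVisualizer()
results = {
    "total_cases": 10, "passed_cases": 8, "failed_cases": 2,
    "pass_rate": 80.0, "average_score": 0.86, "threshold": 0.8,
    "model_name": "openai/gpt-oss-120b", "detailed_results": []
}
viz.create_summary_stats_table(results).write_html("summary.html")  # Plotly figures render to HTML
viz.save_matplotlib_plots(results)  # returns [] here: no detailed_results to score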
src/evaluation_config.yaml ADDED
@@ -0,0 +1,72 @@
# Evaluation Settings
evaluation:
  threshold: 0.8
  verbose_mode: true
  max_retries: 3
  timeout_seconds: 30

# Dataset Settings
dataset:
  input_column: "input"
  output_column: "expected_output"
  context_column: null

# Logging Configuration
logging:
  level: "INFO"
  max_file_size_mb: 10
  backup_count: 5

# Output Settings
output:
  results_dir: "results"
  logs_dir: "logs"
  save_detailed_results: true

# Metrics Configuration
metrics:
  answer_relevancy:
    enabled: true
    threshold: 0.8
    verbose_mode: true

  contextual_precision:
    enabled: false
    threshold: 0.8

# RAG System Configuration
rag_system:
  embedding_model: "qwen3-embedding"
  reranker_model: "qwen3-reranker"
  llm_model: "openai/gpt-oss-120b"  # Groq model
  vector_store: "qdrant"
  chunk_size: 512
  chunk_overlap: 50
  max_context_chunks: 5
  similarity_threshold: 0.7
  rerank_top_k: 20
  final_top_k: 5

# API Configuration
apis:
  groq_api_key: "${GROQ_API_KEY}"
  siliconflow_api_key: "${SILICONFLOW_API_KEY}"
  qdrant_url: "${QDRANT_URL}"
  qdrant_api_key: "${QDRANT_API_KEY}"

# Document Processing Configuration
document_processing:
  pdf_engine: "pymupdf"
  excel_engine: "openpyxl"
  ocr_engine: "tesseract"
  image_processing: true
  table_extraction: true
  max_file_size_mb: 100

# Storage Configuration
storage:
  qdrant_collection: "manufacturing_docs"
  metadata_db_path: "./data/metadata.db"
  file_storage_path: "./data/documents"
  enable_compression: true
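The `${...}` values under `apis` are placeholders for environment-variable substitution; plain `yaml.safe_load` will not expand them. A minimal loader sketch that resolves them at read time (a hypothetical helper, not part of this commit):

import os
import re
import yaml

_ENV_PATTERN = re.compile(r"\$\{([^}]+)\}")

def load_config_with_env(path: str) -> dict:
    """Load a YAML config, replacing ${VAR} placeholders with environment values."""
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    # Substitute each ${VAR} with os.environ["VAR"] (empty string if unset)
    expanded = _ENV_PATTERN.sub(lambda m: os.environ.get(m.group(1), ""), raw)
    return yaml.safe_load(expanded)

# Usage: cfg = load_config_with_env("src/evaluation_config.yaml")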
src/excel_processor.py ADDED
@@ -0,0 +1,628 @@
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.drawing.image import Image as OpenpyxlImage
from dataclasses import dataclass
import io
from PIL import Image as PILImage
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.document_processor import (
    DocumentProcessor,
    ProcessedDocument,
    DocumentType,
    ProcessingStatus,
    DocumentProcessingError,
    ExtractedImage,
    ExtractedTable,
    DocumentProcessorFactory
)


try:
    from logger.custom_logger import CustomLoggerTracker
    custom_log = CustomLoggerTracker()
    logger = custom_log.get_logger("excel_processor")

except ImportError:
    # Fallback to standard logging if custom logger not available
    logger = logging.getLogger("excel_processor")


@dataclass
class ExcelWorksheetInfo:
    """Information about an Excel worksheet."""
    name: str
    max_row: int
    max_column: int
    cell_count: int
    image_count: int
    table_count: int
    has_data: bool


@dataclass
class CellRange:
    """Represents a range of cells in Excel."""
    start_row: int
    start_col: int
    end_row: int
    end_col: int

    def to_excel_range(self) -> str:
        """Convert to Excel range notation (e.g., 'A1:C5')."""
        start_col_letter = openpyxl.utils.get_column_letter(self.start_col)
        end_col_letter = openpyxl.utils.get_column_letter(self.end_col)
        return f"{start_col_letter}{self.start_row}:{end_col_letter}{self.end_row}"

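# Illustrative example (values chosen for demonstration):
#   CellRange(start_row=1, start_col=1, end_row=5, end_col=3).to_excel_range()
#   -> "A1:C5"
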
class ExcelProcessor(DocumentProcessor):
    """
    Excel document processor using openpyxl.

    This processor extracts data from Excel worksheets and embedded images,
    and maintains proper citations with worksheet names and cell references.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the Excel processor.

        Args:
            config: Configuration dictionary containing Excel processing settings
        """
        super().__init__(config)
        self.extract_images = config.get('image_processing', True)
        self.extract_tables = config.get('table_extraction', True)
        self.min_table_rows = config.get('min_table_rows', 2)
        self.min_table_cols = config.get('min_table_cols', 2)
        self.max_empty_rows = config.get('max_empty_rows', 5)
        self.max_empty_cols = config.get('max_empty_cols', 5)

        logger.info(f"Excel processor initialized with image_processing={self.extract_images}, "
                    f"table_extraction={self.extract_tables}")

    def _get_supported_extensions(self) -> List[str]:
        """Get supported file extensions for the Excel processor."""
        return ['.xlsx', '.xls', '.xlsm']

    def process_document(self, file_path: str) -> ProcessedDocument:
        """
        Process an Excel document and extract all content.

        Args:
            file_path: Path to the Excel file

        Returns:
            ProcessedDocument with extracted content and metadata

        Raises:
            DocumentProcessingError: If Excel processing fails
        """
        try:
            # Validate file first
            self.validate_file(file_path)

            # Generate document ID
            document_id = self._generate_document_id(file_path)

            logger.info(f"Processing Excel document: {file_path}")

            # Open Excel workbook
            workbook = openpyxl.load_workbook(file_path, data_only=True)

            try:
                # Extract metadata
                metadata = self._extract_metadata(workbook, file_path)

                # Process all worksheets
                all_content = []
                all_images = []
                all_tables = []
                worksheet_info = []

                for sheet_name in workbook.sheetnames:
                    worksheet = workbook[sheet_name]

                    logger.debug(f"Processing worksheet: {sheet_name}")

                    # Extract data from worksheet
                    sheet_content = self._extract_worksheet_content(worksheet, sheet_name)
                    if sheet_content.strip():
                        all_content.append(f"[Worksheet: {sheet_name}]\n{sheet_content}")

                    # Extract images if enabled (default to empty so the counts below stay defined)
                    sheet_images = []
                    if self.extract_images:
                        sheet_images = self._extract_worksheet_images(worksheet, sheet_name, document_id)
                        all_images.extend(sheet_images)

                    # Extract tables if enabled
                    sheet_tables = []
                    if self.extract_tables:
                        sheet_tables = self._extract_worksheet_tables(worksheet, sheet_name)
                        all_tables.extend(sheet_tables)

                    # Collect worksheet info
                    worksheet_info.append(ExcelWorksheetInfo(
                        name=sheet_name,
                        max_row=worksheet.max_row or 0,
                        max_column=worksheet.max_column or 0,
                        cell_count=self._count_non_empty_cells(worksheet),
                        image_count=len(sheet_images),
                        table_count=len(sheet_tables),
                        has_data=bool(sheet_content.strip())
                    ))

                # Combine all content
                full_content = "\n\n".join(all_content)

                # Update metadata with processing info
                metadata.update({
                    'total_worksheets': len(workbook.sheetnames),
                    'worksheet_names': workbook.sheetnames,
                    'total_images': len(all_images),
                    'total_tables': len(all_tables),
                    'total_content_length': len(full_content),
                    'worksheet_info': [
                        {
                            'name': info.name,
                            'max_row': info.max_row,
                            'max_column': info.max_column,
                            'cell_count': info.cell_count,
                            'image_count': info.image_count,
                            'table_count': info.table_count,
                            'has_data': info.has_data
                        }
                        for info in worksheet_info
                    ]
                })

                # Create processed document
                processed_doc = ProcessedDocument(
                    document_id=document_id,
                    filename=Path(file_path).name,
                    file_path=file_path,
                    document_type=DocumentType.EXCEL,
                    content=full_content,
                    metadata=metadata,
                    images=all_images,
                    tables=all_tables,
                    processing_status=ProcessingStatus.COMPLETED
                )

                logger.info(f"Successfully processed Excel: {len(workbook.sheetnames)} worksheets, "
                            f"{len(all_images)} images, {len(all_tables)} tables")

                return processed_doc

            finally:
                workbook.close()

        except Exception as e:
            logger.error(f"Failed to process Excel {file_path}: {e}")

            # Create failed document
            document_id = self._generate_document_id(file_path)
            return ProcessedDocument(
                document_id=document_id,
                filename=Path(file_path).name,
                file_path=file_path,
                document_type=DocumentType.EXCEL,
                content="",
                metadata={},
                processing_status=ProcessingStatus.FAILED,
                error_message=str(e)
            )

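    # Note (openpyxl behavior): with data_only=True, load_workbook returns the
    # values Excel last cached for formula cells; a workbook saved without a
    # calculation pass can yield None there, and such cells are treated as
    # empty by the extraction helpers below.
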
    def _extract_metadata(self, workbook: openpyxl.Workbook, file_path: str) -> Dict[str, Any]:
        """Extract workbook properties and basic file information."""
        metadata = {}
        try:
            props = workbook.properties
            if props.title:
                metadata['title'] = props.title
            if props.creator:
                metadata['creator'] = props.creator
            if props.description:
                metadata['description'] = props.description
            if props.subject:
                metadata['subject'] = props.subject
            if props.keywords:
                metadata['keywords'] = props.keywords
            if props.category:
                metadata['category'] = props.category
            if props.created:
                metadata['created'] = props.created.isoformat()
            if props.modified:
                metadata['modified'] = props.modified.isoformat()
            if props.lastModifiedBy:
                metadata['last_modified_by'] = props.lastModifiedBy

            # Workbook info
            metadata['worksheet_count'] = len(workbook.sheetnames)
            metadata['active_sheet'] = workbook.active.title if workbook.active else None

            # File info
            file_path_obj = Path(file_path)
            metadata['file_size'] = file_path_obj.stat().st_size
            metadata['file_extension'] = file_path_obj.suffix

        except Exception as e:
            logger.warning(f"Failed to extract Excel metadata: {e}")
            metadata['metadata_extraction_error'] = str(e)

        return metadata

    def _extract_worksheet_content(self, worksheet: Worksheet, sheet_name: str) -> str:
        """Extract cell values from a worksheet as tab-separated text."""
        try:
            content_lines = []

            if not worksheet.max_row or worksheet.max_row == 1:
                return ""

            # Iterate through rows and columns
            for row in worksheet.iter_rows(min_row=1, max_row=worksheet.max_row,
                                           min_col=1, max_col=worksheet.max_column,
                                           values_only=True):

                # Convert row values to strings, handling None values
                row_values = []
                for cell_value in row:
                    if cell_value is not None:
                        # Handle different data types
                        if isinstance(cell_value, (int, float)):
                            row_values.append(str(cell_value))
                        elif isinstance(cell_value, str):
                            row_values.append(cell_value.strip())
                        else:
                            row_values.append(str(cell_value))
                    else:
                        row_values.append("")

                # Skip completely empty rows
                if any(val.strip() for val in row_values if val):
                    # Join the row values with tabs
                    row_text = "\t".join(row_values)
                    content_lines.append(row_text)

            return "\n".join(content_lines)

        except Exception as e:
            logger.warning(f"Failed to extract content from worksheet {sheet_name}: {e}")
            return ""

    def _extract_worksheet_images(self, worksheet: Worksheet, sheet_name: str, document_id: str) -> List[ExtractedImage]:
        """Extract embedded images from a worksheet."""
        images = []

        try:
            # Get images from worksheet (openpyxl keeps them in the private _images list)
            if hasattr(worksheet, '_images') and worksheet._images:
                for img_index, img in enumerate(worksheet._images):
                    try:
                        # Extract image data
                        image_data = self._extract_image_data(img)
                        if not image_data:
                            continue

                        # Create image object
                        image_id = f"{document_id}_{sheet_name}_img{img_index}"
                        filename = f"{sheet_name}_image{img_index}.{image_data['format'].lower()}"

                        # Get image position if available (anchor markers are zero-based,
                        # so convert to one-based Excel cell references)
                        anchor_info = {}
                        if hasattr(img, 'anchor') and img.anchor:
                            if hasattr(img.anchor, '_from') and img.anchor._from is not None:
                                anchor_info['from_cell'] = (
                                    f"{openpyxl.utils.get_column_letter(img.anchor._from.col + 1)}"
                                    f"{img.anchor._from.row + 1}"
                                )
                            if hasattr(img.anchor, 'to') and img.anchor.to is not None:
                                anchor_info['to_cell'] = (
                                    f"{openpyxl.utils.get_column_letter(img.anchor.to.col + 1)}"
                                    f"{img.anchor.to.row + 1}"
                                )

                        extracted_image = ExtractedImage(
                            image_id=image_id,
                            filename=filename,
                            content=image_data['content'],
                            format=image_data['format'],
                            width=image_data.get('width'),
                            height=image_data.get('height'),
                            extraction_method="openpyxl",
                            metadata={
                                'worksheet_name': sheet_name,
                                'image_index': img_index,
                                'size_bytes': len(image_data['content']),
                                'anchor_info': anchor_info
                            }
                        )

                        images.append(extracted_image)

                    except Exception as e:
                        logger.warning(f"Failed to extract image {img_index} from worksheet {sheet_name}: {e}")
                        continue

        except Exception as e:
            logger.warning(f"Failed to extract images from worksheet {sheet_name}: {e}")

        return images

    def _extract_image_data(self, img: OpenpyxlImage) -> Optional[Dict[str, Any]]:
        """
        Extract data from an openpyxl Image object.

        Args:
            img: openpyxl Image object

        Returns:
            Dictionary with image data or None if extraction fails
        """
        try:
            # Get image data, preferring the embedded bytes (img._data is a
            # callable returning raw bytes) over the image reference
            if hasattr(img, '_data') and img._data:
                image_data = img._data()
            elif hasattr(img, 'ref') and img.ref:
                image_data = img.ref
            else:
                logger.warning("No image data found in image object")
                return None

            # Determine format
            image_format = "PNG"  # Default
            if hasattr(img, 'format') and img.format:
                image_format = img.format.upper()

            # Try to get dimensions using PIL
            width, height = None, None
            try:
                with io.BytesIO(image_data) as img_buffer:
                    pil_img = PILImage.open(img_buffer)
                    width, height = pil_img.size
            except Exception as e:
                logger.debug(f"Could not determine image dimensions: {e}")

            return {
                'content': image_data,
                'format': image_format,
                'width': width,
                'height': height
            }

        except Exception as e:
            logger.warning(f"Failed to extract image data: {e}")
            return None

    def _extract_worksheet_tables(self, worksheet: Worksheet, sheet_name: str) -> List[ExtractedTable]:
        """
        Extract tables from an Excel worksheet.

        Args:
            worksheet: openpyxl Worksheet object
            sheet_name: Name of the worksheet

        Returns:
            List of ExtractedTable objects
        """
        tables = []

        try:
            # First, try to extract defined tables
            if hasattr(worksheet, 'tables') and worksheet.tables:
                for table_name, table in worksheet.tables.items():
                    try:
                        extracted_table = self._extract_defined_table(table, sheet_name, len(tables))
                        if extracted_table:
                            tables.append(extracted_table)
                    except Exception as e:
                        logger.warning(f"Failed to extract defined table {table_name}: {e}")

            # If no defined tables were found, try to detect tables from data
            if not tables and self.extract_tables:
                detected_tables = self._detect_data_tables(worksheet, sheet_name)
                tables.extend(detected_tables)

        except Exception as e:
            logger.warning(f"Failed to extract tables from worksheet {sheet_name}: {e}")

        return tables

    def _extract_defined_table(self, table, sheet_name: str, table_index: int) -> Optional[ExtractedTable]:
        """
        Extract a defined Excel table.

        Args:
            table: Excel table object
            sheet_name: Name of the worksheet
            table_index: Index of the table

        Returns:
            ExtractedTable object or None if extraction fails
        """
        try:
            # Get table range (e.g., "A1:C10")
            table_range = table.ref

            # Get table data from worksheet
            worksheet = table.parent
            table_data = []

            for row in worksheet[table_range]:
                row_data = []
                for cell in row:
                    value = cell.value if cell.value is not None else ""
                    row_data.append(str(value))
                table_data.append(row_data)

            if not table_data:
                return None

            # First row is typically headers
            headers = table_data[0] if table_data else []
            rows = table_data[1:] if len(table_data) > 1 else []

            # Create table object
            table_id = f"{sheet_name}_table{table_index}"

            return ExtractedTable(
                table_id=table_id,
                headers=headers,
                rows=rows,
                worksheet_name=sheet_name,
                cell_range=table_range,
                extraction_confidence=0.9,  # High confidence for defined tables
                metadata={
                    'extraction_method': 'defined_table',
                    'table_index': table_index,
                    'table_name': getattr(table, 'name', '')
                }
            )

        except Exception as e:
            logger.warning(f"Failed to extract defined table: {e}")
            return None

    def _detect_data_tables(self, worksheet: Worksheet, sheet_name: str) -> List[ExtractedTable]:
        """
        Detect tables from worksheet data patterns.

        Args:
            worksheet: openpyxl Worksheet object
            sheet_name: Name of the worksheet

        Returns:
            List of detected ExtractedTable objects
        """
        tables = []

        try:
            if not worksheet.max_row or worksheet.max_row < self.min_table_rows:
                return tables

            # Simple table detection: look for contiguous data blocks
            data_blocks = self._find_data_blocks(worksheet)

            for block_index, data_block in enumerate(data_blocks):
                if len(data_block) >= self.min_table_rows and len(data_block[0]) >= self.min_table_cols:
                    # Create table from data block
                    headers = data_block[0]
                    rows = data_block[1:]

                    # Calculate cell range (simplified: actual block positions are not tracked)
                    start_row = 1
                    end_row = start_row + len(data_block) - 1
                    start_col = 1
                    end_col = len(headers)

                    cell_range = CellRange(start_row, start_col, end_row, end_col).to_excel_range()

                    table_id = f"{sheet_name}_detected_table{block_index}"

                    table = ExtractedTable(
                        table_id=table_id,
                        headers=headers,
                        rows=rows,
                        worksheet_name=sheet_name,
                        cell_range=cell_range,
                        extraction_confidence=0.7,  # Lower confidence for detected tables
                        metadata={
                            'extraction_method': 'data_pattern_detection',
                            'table_index': block_index
                        }
                    )

                    tables.append(table)

        except Exception as e:
            logger.warning(f"Failed to detect data tables: {e}")

        return tables

    def _find_data_blocks(self, worksheet: Worksheet) -> List[List[List[str]]]:
        """
        Find contiguous blocks of data in the worksheet.

        Args:
            worksheet: openpyxl Worksheet object

        Returns:
            List of data blocks, where each block is a list of rows
        """
        data_blocks = []

        try:
            current_block = []
            empty_row_count = 0

            for row in worksheet.iter_rows(min_row=1, max_row=worksheet.max_row,
                                           min_col=1, max_col=worksheet.max_column,
                                           values_only=True):

                # Convert row to strings
                row_values = []
                has_data = False

                for cell_value in row:
                    if cell_value is not None:
                        row_values.append(str(cell_value).strip())
                        if str(cell_value).strip():
                            has_data = True
                    else:
                        row_values.append("")

                if has_data:
                    # Reset empty row count and add to current block
                    empty_row_count = 0
                    current_block.append(row_values)
                else:
                    # Empty row
                    empty_row_count += 1

                    # If we've seen too many empty rows, end the current block
                    if empty_row_count >= self.max_empty_rows and current_block:
                        if len(current_block) >= self.min_table_rows:
                            data_blocks.append(current_block)
                        current_block = []
                        empty_row_count = 0

            # Add the final block if it exists
            if current_block and len(current_block) >= self.min_table_rows:
                data_blocks.append(current_block)

        except Exception as e:
            logger.warning(f"Failed to find data blocks: {e}")

        return data_blocks

    def _count_non_empty_cells(self, worksheet: Worksheet) -> int:
        """
        Count non-empty cells in a worksheet.

        Args:
            worksheet: openpyxl Worksheet object

        Returns:
            Number of non-empty cells
        """
        count = 0

        try:
            for row in worksheet.iter_rows(values_only=True):
                for cell_value in row:
                    if cell_value is not None and str(cell_value).strip():
                        count += 1
        except Exception as e:
            logger.warning(f"Failed to count non-empty cells: {e}")

        return count


# Register the Excel processor
DocumentProcessorFactory.register_processor(DocumentType.EXCEL, ExcelProcessor)


if __name__ == "__main__":
    logger.info("Starting Excel processing...")
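A minimal usage sketch for this processor (the file path and config values are illustrative; the constructor config keys and `process_document` come from the code above):

from src.document_processor import ProcessingStatus
from src.excel_processor import ExcelProcessor

processor = ExcelProcessor({'image_processing': True, 'table_extraction': True})
doc = processor.process_document("data/documents/example.xlsx")  # hypothetical file
if doc.processing_status == ProcessingStatus.COMPLETED:
    print(doc.metadata['total_worksheets'], len(doc.tables), len(doc.images))
else:
    print(doc.error_message)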
src/gradio_demo.py ADDED
@@ -0,0 +1,676 @@
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import tempfile
import time
import json
import logging
import os
import sys
from typing import Dict, Any, Tuple
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


try:
    from src.config import Config
    from src.ingestion_pipeline import DocumentIngestionPipeline, IngestionResult
    from src.rag_engine import RAGEngine, RAGResponse
    from src.metadata_manager import MetadataManager
    from src.document_processor import ProcessingStatus
    from src.embedding_system import EmbeddingSystem
    from src.vector_store import QdrantVectorStore
    from src.groq_client import LLMSystem
    from logger.custom_logger import CustomLoggerTracker
    custom_log = CustomLoggerTracker()
    logger = custom_log.get_logger("gradio_demo")

except ImportError:
    # Fallback to standard logging if custom logger not available;
    # Config is set to None so initialize_system can report the missing modules.
    Config = None
    logger = logging.getLogger("gradio_demo")


class RAGGradioDemo:
    """
    Gradio demo application for the Manufacturing RAG Agent.

    This demo provides a user-friendly interface for document upload,
    question answering, and result visualization using Gradio.
    """

    def __init__(self):
        """Initialize the RAG demo application."""
        self.config = None
        self.ingestion_pipeline = None
        self.rag_engine = None
        self.metadata_manager = None
        self.embedding_system = None
        self.vector_store = None
        self.llm_system = None

        # Demo state
        self.chat_history = []
        self.documents = []
        self.system_initialized = False

    def initialize_system(self) -> Tuple[bool, str]:
        """
        Initialize the RAG system components.

        Returns:
            Tuple of (success, message)
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        src_dir = os.path.dirname(current_dir)
        try:
            # Check if required modules are imported
            if Config is None:
                return False, "RAG modules not imported. Please ensure all src/ modules are available and properly structured."

            # Check for config file in multiple locations
            config_paths = [
                "config.yaml",
                "src/config.yaml",
                os.path.join(current_dir, "config.yaml"),
                os.path.join(src_dir, "config.yaml")
            ]

            config_path = None
            for path in config_paths:
                if os.path.exists(path):
                    config_path = path
                    break

            if not config_path:
                available_files = []
                for search_dir in [current_dir, src_dir]:
                    if os.path.exists(search_dir):
                        files = [f for f in os.listdir(search_dir) if f.endswith('.yaml') or f.endswith('.yml')]
                        if files:
                            available_files.extend([os.path.join(search_dir, f) for f in files])

                error_msg = f"Configuration file not found. Searched: {config_paths}"
                if available_files:
                    error_msg += f"\nAvailable config files: {available_files}"
                return False, error_msg

            logger.info(f"Using config file: {config_path}")

            # Load configuration
            self.config = Config(config_path)

            # Initialize components
            config_dict = {
                'siliconflow_api_key': self.config.siliconflow_api_key,
                'groq_api_key': self.config.groq_api_key,
                'qdrant_url': self.config.qdrant_url,
                'qdrant_api_key': self.config.qdrant_api_key,
                **self.config.rag_config,
                **self.config.document_processing_config,
                **self.config.storage_config
            }

            # Initialize core components
            self.embedding_system = EmbeddingSystem(config_dict)
            self.vector_store = QdrantVectorStore(config_dict)
            self.llm_system = LLMSystem(config_dict)
            self.ingestion_pipeline = DocumentIngestionPipeline(config_dict)
            self.rag_engine = RAGEngine(config_dict)
            self.metadata_manager = MetadataManager(config_dict)

            self.system_initialized = True
            return True, "RAG system initialized successfully!"

        except Exception as e:
            error_msg = f"Failed to initialize RAG system: {str(e)}"
            logger.error(error_msg)
            logger.error(f"Error details: {type(e).__name__}")
            return False, error_msg

    def process_uploaded_files(self, files) -> Tuple[str, pd.DataFrame]:
        """
        Process uploaded files through the ingestion pipeline.

        Args:
            files: List of uploaded file objects

        Returns:
            Tuple of (status_message, results_dataframe)
        """
        if not self.system_initialized:
            return "❌ System not initialized. Please initialize first.", pd.DataFrame()

        if not files:
            return "No files uploaded.", pd.DataFrame()

        results = []
        total_files = len(files)

        try:
            for file in files:
                # Save uploaded file temporarily
                temp_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.name).suffix) as tmp_file:
                        tmp_file.write(file.read())
                        temp_path = tmp_file.name

                    # Process document
                    result = self.ingestion_pipeline.ingest_document(temp_path)

                    # Add result info
                    results.append({
                        'Filename': file.name,
                        'Status': '✅ Success' if result.success else '❌ Failed',
                        'Chunks Created': result.chunks_created,
                        'Chunks Indexed': result.chunks_indexed,
                        'Processing Time (s)': f"{result.processing_time:.2f}",
                        'Error Message': result.error_message or 'None'
                    })

                except Exception as e:
                    results.append({
                        'Filename': file.name,
                        'Status': '❌ Failed',
                        'Chunks Created': 0,
                        'Chunks Indexed': 0,
                        'Processing Time (s)': '0.00',
                        'Error Message': str(e)
                    })

                finally:
                    # Clean up temporary file
                    if temp_path and os.path.exists(temp_path):
                        os.unlink(temp_path)

            # Create results summary
            successful = sum(1 for r in results if 'Success' in r['Status'])
            total_chunks = sum(r['Chunks Indexed'] for r in results if isinstance(r['Chunks Indexed'], int))

            status_msg = f"✅ Processing Complete: {successful}/{total_files} files processed successfully. Total chunks indexed: {total_chunks}"

            return status_msg, pd.DataFrame(results)

        except Exception as e:
            error_msg = f"❌ Batch processing failed: {str(e)}"
            logger.error(error_msg)
            return error_msg, pd.DataFrame(results) if results else pd.DataFrame()

    def ask_question(self, question: str, max_results: int = 5,
                     similarity_threshold: float = 0.7, document_filter: str = "All") -> Tuple[str, str, pd.DataFrame]:
        """
        Process a question through the RAG engine.

        Args:
            question: Question to answer
            max_results: Maximum context chunks
            similarity_threshold: Similarity threshold for retrieval
            document_filter: Document type filter

        Returns:
            Tuple of (answer, citations_info, performance_dataframe)
        """
        if not self.system_initialized:
            return "❌ System not initialized. Please initialize first.", "", pd.DataFrame()

        if not question.strip():
            return "Please enter a question.", "", pd.DataFrame()

        try:
            # Check if documents are available
            documents = self.metadata_manager.list_documents(
                status=ProcessingStatus.COMPLETED,
                limit=1
            )
            if not documents:
                return "⚠️ No processed documents available. Please upload and process documents first.", "", pd.DataFrame()

            # Prepare filters
            filters = {}
            if document_filter != "All":
                filters["document_type"] = document_filter.lower()

            # Update RAG engine config temporarily
            original_config = {
                'final_top_k': self.rag_engine.final_top_k,
                'similarity_threshold': self.rag_engine.similarity_threshold
            }

            self.rag_engine.final_top_k = max_results
            self.rag_engine.similarity_threshold = similarity_threshold

            # Get response
            response = self.rag_engine.answer_question(question, filters if filters else None)

            # Restore original config
            self.rag_engine.final_top_k = original_config['final_top_k']
            self.rag_engine.similarity_threshold = original_config['similarity_threshold']

            # Add to chat history
            self.chat_history.append((question, response))

            # Format answer
            if not response.success:
                return f"❌ Failed to generate answer: {response.error_message}", "", pd.DataFrame()

            # Create citations info
            citations_info = self._format_citations(response.citations)

            # Create performance dataframe
            performance_data = {
                'Metric': ['Confidence Score', 'Processing Time (s)', 'Retrieval Time (s)',
                           'Generation Time (s)', 'Rerank Time (s)', 'Sources Used', 'Chunks Retrieved'],
                'Value': [
                    f"{response.confidence_score:.3f}",
                    f"{response.processing_time:.3f}",
                    f"{response.retrieval_time:.3f}",
                    f"{response.generation_time:.3f}",
                    f"{response.rerank_time:.3f}",
                    len(response.citations),
                    response.total_chunks_retrieved
                ]
            }

            performance_df = pd.DataFrame(performance_data)

            return response.answer, citations_info, performance_df

        except Exception as e:
            error_msg = f"❌ Question processing failed: {str(e)}"
            logger.error(error_msg)
            return error_msg, "", pd.DataFrame()

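    # Note: ask_question temporarily mutates shared RAGEngine attributes
    # (final_top_k, similarity_threshold) and restores them afterwards, so
    # concurrent Gradio sessions could briefly observe each other's settings.
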
    def _format_citations(self, citations) -> str:
        """Format citations for display."""
        if not citations:
            return "No citations available."

        citation_text = "## 📚 Sources & Citations\n\n"

        for i, citation in enumerate(citations):
            citation_text += f"**Source {i+1}:** {citation.source_file} (Confidence: {citation.confidence:.3f})\n"

            # Add specific location info
            location_parts = []
            if citation.page_number:
                location_parts.append(f"📄 Page: {citation.page_number}")
            if citation.worksheet_name:
                location_parts.append(f"📊 Sheet: {citation.worksheet_name}")
            if citation.cell_range:
                location_parts.append(f"🔢 Range: {citation.cell_range}")
            if citation.section_title:
                location_parts.append(f"📑 Section: {citation.section_title}")

            if location_parts:
                citation_text += f"*Location:* {' | '.join(location_parts)}\n"

            citation_text += f"*Excerpt:* \"{citation.text_snippet}\"\n\n"

        return citation_text

    def get_document_library(self) -> pd.DataFrame:
        """Get document library as DataFrame."""
        if not self.system_initialized:
            return pd.DataFrame({'Message': ['System not initialized']})

        try:
            documents = self.metadata_manager.list_documents(limit=100)

            if not documents:
                return pd.DataFrame({'Message': ['No documents uploaded yet']})

            doc_data = []
            for doc in documents:
                doc_data.append({
                    'Filename': doc.filename,
                    'Type': doc.file_type.upper(),
                    'Status': doc.processing_status.value.title(),
                    'Chunks': doc.total_chunks,
                    'Size': self._format_file_size(doc.file_size),
                    'Uploaded': doc.upload_timestamp.strftime('%Y-%m-%d %H:%M'),
                    'Processing Time (s)': f"{doc.processing_time:.2f}" if doc.processing_time else "N/A"
                })

            return pd.DataFrame(doc_data)

        except Exception as e:
            logger.error(f"Failed to load document library: {e}")
            return pd.DataFrame({'Error': [str(e)]})

    def get_system_status(self) -> Tuple[str, pd.DataFrame]:
        """Get system status and health information."""
        if not self.system_initialized:
            return "❌ System not initialized", pd.DataFrame()

        try:
            # Health checks
            rag_health = self.rag_engine.health_check()
            pipeline_health = self.ingestion_pipeline.health_check()

            # Create status message
            status_parts = []
            for component, healthy in rag_health.items():
                status = "✅ Healthy" if healthy else "❌ Unhealthy"
                status_parts.append(f"**{component.replace('_', ' ').title()}:** {status}")

            status_message = "## 🏥 System Health\n" + "\n".join(status_parts)

            # Create detailed status table
            all_health = {**rag_health, **pipeline_health}
            health_data = []

            for component, healthy in all_health.items():
                health_data.append({
                    'Component': component.replace('_', ' ').title(),
                    'Status': '✅ Healthy' if healthy else '❌ Unhealthy',
                    'Last Checked': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })

            return status_message, pd.DataFrame(health_data)

        except Exception as e:
            error_msg = f"❌ Failed to check system status: {str(e)}"
            logger.error(error_msg)
            return error_msg, pd.DataFrame()

    def get_analytics_data(self) -> Tuple[str, Dict[str, Any]]:
        """Get analytics data for visualization."""
        if not self.system_initialized:
            return "❌ System not initialized", {}

        try:
            # Get system statistics
            pipeline_stats = self.ingestion_pipeline.get_pipeline_stats()
            metadata_stats = self.metadata_manager.get_statistics()

            # Create summary message
            total_docs = metadata_stats.get('total_documents', 0)
            total_chunks = metadata_stats.get('total_chunks', 0)
            total_size = metadata_stats.get('total_file_size', 0)

            summary = f"""## 📊 Analytics Overview

**Total Documents:** {total_docs}
**Total Chunks:** {total_chunks}
**Total File Size:** {self._format_file_size(total_size)}
**Vector Points:** {pipeline_stats.get('vector_store', {}).get('total_points', 0)}
"""

            # Prepare data for charts
            analytics_data = {
                'document_types': metadata_stats.get('documents_by_type', {}),
                'processing_status': metadata_stats.get('documents_by_status', {}),
                'pipeline_stats': pipeline_stats,
                'metadata_stats': metadata_stats
            }

            return summary, analytics_data

        except Exception as e:
            error_msg = f"❌ Failed to load analytics: {str(e)}"
            logger.error(error_msg)
            return error_msg, {}

    def create_document_type_chart(self, analytics_data: Dict[str, Any]):
        """Create document type distribution chart."""
        if not analytics_data or 'document_types' not in analytics_data:
            return None

        type_counts = analytics_data['document_types']
        if not type_counts:
            return None

        fig = px.pie(
            values=list(type_counts.values()),
            names=list(type_counts.keys()),
            title="Documents by Type"
        )
        return fig

    def create_status_chart(self, analytics_data: Dict[str, Any]):
        """Create processing status chart."""
        if not analytics_data or 'processing_status' not in analytics_data:
            return None

        status_counts = analytics_data['processing_status']
        if not status_counts:
            return None

        fig = px.bar(
            x=list(status_counts.keys()),
            y=list(status_counts.values()),
            title="Documents by Processing Status"
        )
        return fig

    def _format_file_size(self, size_bytes: int) -> str:
        """Format file size in human readable format."""
        if size_bytes == 0:
            return "0B"

        size_names = ["B", "KB", "MB", "GB", "TB"]
        i = 0
        while size_bytes >= 1024 and i < len(size_names) - 1:
            size_bytes /= 1024.0
            i += 1

        return f"{size_bytes:.1f}{size_names[i]}"

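# Illustrative _format_file_size results (computed from the loop above):
#   _format_file_size(0) -> "0B"; _format_file_size(1536) -> "1.5KB";
#   _format_file_size(5 * 1024**2) -> "5.0MB"
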
def create_gradio_interface():
    """Create the main Gradio interface."""

    # Initialize demo instance
    demo_instance = RAGGradioDemo()

    # Define the interface
    with gr.Blocks(title="Manufacturing RAG Agent", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🏭 Manufacturing RAG Agent
        *Intelligent document analysis for manufacturing data*

        This system allows you to upload manufacturing documents (PDF, Excel, Images) and ask questions about their content.
        """)

        # System Status
        with gr.Row():
            with gr.Column(scale=3):
                system_status = gr.Markdown("**System Status:** Not initialized")
            with gr.Column(scale=1):
                init_btn = gr.Button("🚀 Initialize System", variant="primary")

        # Main tabs
        with gr.Tabs():
            # Document Upload Tab
            with gr.TabItem("📄 Document Upload"):
                gr.Markdown("### Upload and Process Documents")

                with gr.Row():
                    with gr.Column(scale=2):
                        file_upload = gr.File(
                            file_count="multiple",
                            file_types=[".pdf", ".xlsx", ".xls", ".xlsm", ".png", ".jpg", ".jpeg"],
                            label="Choose files to upload"
                        )
                        upload_btn = gr.Button("🔄 Process Documents", variant="primary")

                    with gr.Column(scale=1):
                        upload_status = gr.Textbox(
                            label="Processing Status",
                            interactive=False,
                            lines=3
                        )

                # Results display
                upload_results = gr.Dataframe(
                    label="Processing Results",
                    interactive=False
                )

                # Document Library
                gr.Markdown("### 📚 Document Library")
                refresh_docs_btn = gr.Button("🔄 Refresh Library")
                doc_library = gr.Dataframe(
                    label="Uploaded Documents",
                    interactive=False
                )

            # Question Answering Tab
            with gr.TabItem("❓ Ask Questions"):
                gr.Markdown("### Ask Questions About Your Documents")

                with gr.Row():
                    with gr.Column(scale=2):
                        question_input = gr.Textbox(
                            label="Your Question",
                            placeholder="e.g., What is the average production yield for Q3?",
                            lines=2
                        )

                        with gr.Row():
                            ask_btn = gr.Button("🔍 Ask Question", variant="primary")
                            clear_btn = gr.Button("🗑️ Clear")

                    with gr.Column(scale=1):
                        gr.Markdown("#### Advanced Options")
                        max_results = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Max Context Chunks"
                        )
                        similarity_threshold = gr.Slider(
                            minimum=0.0, maximum=1.0, value=0.7, step=0.1,
                            label="Similarity Threshold"
                        )
                        doc_filter = gr.Dropdown(
                            choices=["All", "PDF", "Excel", "Image"],
                            value="All",
                            label="Filter by Document Type"
                        )

                # Answer display
                answer_output = gr.Markdown(label="Answer")
                citations_output = gr.Markdown(label="Citations")

                # Performance metrics
                performance_metrics = gr.Dataframe(
                    label="Performance Metrics",
                    interactive=False
                )

            # Analytics Tab
            with gr.TabItem("📊 Analytics"):
                gr.Markdown("### System Analytics")

                refresh_analytics_btn = gr.Button("🔄 Refresh Analytics")
                analytics_summary = gr.Markdown("Analytics data will appear here...")

                with gr.Row():
                    doc_type_chart = gr.Plot(label="Document Types")
                    status_chart = gr.Plot(label="Processing Status")

            # System Status Tab
            with gr.TabItem("⚙️ System Status"):
                gr.Markdown("### System Health & Configuration")

                check_health_btn = gr.Button("🔍 Check System Health")
                health_status = gr.Markdown("System health information will appear here...")
                health_details = gr.Dataframe(
                    label="Component Health Details",
                    interactive=False
                )

        # Event handlers
        def initialize_system():
            success, message = demo_instance.initialize_system()
            status_color = "green" if success else "red"
            status_icon = "✅" if success else "❌"
            return f"**System Status:** <span style='color: {status_color}'>{status_icon} {message}</span>"

        def refresh_document_library():
            return demo_instance.get_document_library()

        def refresh_analytics():
            summary, data = demo_instance.get_analytics_data()
            doc_chart = demo_instance.create_document_type_chart(data)
            status_chart_fig = demo_instance.create_status_chart(data)
            return summary, doc_chart, status_chart_fig

        def check_system_health():
            status, details = demo_instance.get_system_status()
            return status, details

        def clear_question():
            return "", "", gr.Dataframe()

        # Connect event handlers
        init_btn.click(
            initialize_system,
            outputs=[system_status]
        )

        upload_btn.click(
            demo_instance.process_uploaded_files,
            inputs=[file_upload],
            outputs=[upload_status, upload_results]
        )

        refresh_docs_btn.click(
            refresh_document_library,
            outputs=[doc_library]
        )

        ask_btn.click(
            demo_instance.ask_question,
            inputs=[question_input, max_results, similarity_threshold, doc_filter],
            outputs=[answer_output, citations_output, performance_metrics]
        )

        clear_btn.click(
            clear_question,
            outputs=[question_input, answer_output, performance_metrics]
        )

        refresh_analytics_btn.click(
            refresh_analytics,
            outputs=[analytics_summary, doc_type_chart, status_chart]
        )

        check_health_btn.click(
            check_system_health,
            outputs=[health_status, health_details]
        )

        # Auto-refresh the document library after uploads
        # (a second listener registered on the same button is intentional)
        upload_btn.click(
            refresh_document_library,
            outputs=[doc_library]
        )

    return demo

def main():
    """Main function to launch the Gradio demo."""
    try:
        # Create and launch the interface
        demo = create_gradio_interface()

        # Launch with configuration
        demo.launch(
            server_name="0.0.0.0",  # Allow external connections
            server_port=7860,       # Default Gradio port
            share=False,            # Set to True to create a public link
            debug=True,             # Enable debug mode
            show_error=True,        # Show detailed error messages
            quiet=False             # Enable logging
        )

    except Exception as e:
        print(f"Failed to launch Gradio demo: {e}")
        print("Please ensure all dependencies are installed and the src/ directory contains the required modules.")

if __name__ == "__main__":
    main()
src/groq_client.py ADDED
@@ -0,0 +1,378 @@
import logging
import time
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import requests
import json
import os
import sys
from dotenv import load_dotenv
load_dotenv()

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Setup logging
try:
    from logger.custom_logger import CustomLoggerTracker
    custom_log = CustomLoggerTracker()
    logger = custom_log.get_logger("groq_client")

except ImportError:
    # Fallback to standard logging if custom logger not available
    logger = logging.getLogger("groq_client")


@dataclass
class LLMResponse:
    """Response from LLM generation."""
    text: str
    model_name: str
    processing_time: float
    token_count: int
    success: bool
    error_message: Optional[str] = None
    finish_reason: Optional[str] = None

class GroqClient:
    """
    Groq API client for fast LLM inference.

    This client provides high-speed inference using Groq's LPU architecture
    with support for various models like Llama, Mixtral, and Gemma.
    """

    def __init__(self, api_key: str, base_url: str = "https://api.groq.com/openai/v1"):
        """
        Initialize the Groq client.

        Args:
            api_key: Groq API key
            base_url: Base URL for Groq API
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        })

        # Rate limiting
        self.max_requests_per_minute = 30
        self.request_timestamps = []

        logger.info(f"Groq client initialized with base URL: {base_url}")

    def generate_response(self, messages: List[Dict[str, str]],
                          model: str = "openai/gpt-oss-120b",
                          max_tokens: int = 1024,
                          temperature: float = 0.1) -> LLMResponse:
        """
        Generate response using Groq LLM.

        Args:
            messages: List of message dictionaries with 'role' and 'content'
            model: Model name to use
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature

        Returns:
            LLMResponse with generated text and metadata
        """
        start_time = time.time()

        try:
            # Rate limiting check
            self._check_rate_limit()

            # Prepare request payload
            payload = {
                "model": model,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "stream": False
            }

            # Make API request
            response = self.session.post(
                f"{self.base_url}/chat/completions",
                json=payload,
                timeout=60
            )

            processing_time = time.time() - start_time

            if response.status_code == 200:
                data = response.json()

                # Extract response text
                choice = data.get('choices', [{}])[0]
                message = choice.get('message', {})
                generated_text = message.get('content', '')
                finish_reason = choice.get('finish_reason', 'unknown')

                # Get usage info
                usage = data.get('usage', {})
                token_count = usage.get('total_tokens', 0)

                logger.debug(f"Generated response in {processing_time:.2f}s, {token_count} tokens")

                return LLMResponse(
                    text=generated_text,
                    model_name=model,
                    processing_time=processing_time,
                    token_count=token_count,
                    success=True,
                    finish_reason=finish_reason
                )
            else:
                error_msg = f"API request failed with status {response.status_code}: {response.text}"
                logger.error(error_msg)

                return LLMResponse(
                    text="",
                    model_name=model,
                    processing_time=processing_time,
                    token_count=0,
                    success=False,
                    error_message=error_msg
                )

        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = f"LLM generation failed: {str(e)}"
            logger.error(error_msg)

            return LLMResponse(
                text="",
                model_name=model,
                processing_time=processing_time,
                token_count=0,
                success=False,
                error_message=error_msg
            )

    def answer_question(self, question: str, context: str,
                        model: str = "openai/gpt-oss-120b") -> LLMResponse:
        """
        Answer a question based on provided context.

        Args:
            question: Question to answer
            context: Context information
            model: Model name to use

        Returns:
            LLMResponse with the answer
        """
        # Create system prompt for manufacturing Q&A
        system_prompt = """You are an expert manufacturing analyst assistant. Your task is to answer questions about manufacturing data, processes, and documentation based on the provided context.

Guidelines:
1. Answer questions accurately based only on the provided context
2. If the context doesn't contain enough information, say so clearly
3. Include specific references to data points, measurements, or processes when available
4. Use technical manufacturing terminology appropriately
5. Provide concise but complete answers
6. If asked about trends or comparisons, use the numerical data from the context

Always cite your sources by mentioning the specific document, page, or section where you found the information."""

        # Create user prompt with context and question
        user_prompt = f"""Context:
{context}

Question: {question}

Please provide a detailed answer based on the context above. Include specific citations where possible."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        return self.generate_response(messages, model, max_tokens=1024, temperature=0.1)

    def summarize_document(self, content: str,
                           model: str = "openai/gpt-oss-120b") -> LLMResponse:
        """Summarize manufacturing document content."""
        system_prompt = """You are an expert at summarizing manufacturing documents. Create concise, informative summaries that capture the key information, data points, and insights from manufacturing documentation."""

        user_prompt = f"""Please provide a comprehensive summary of the following manufacturing document content:

{content}

Focus on:
- Key manufacturing processes described
- Important measurements, specifications, or data points
- Quality metrics or performance indicators
- Any issues, recommendations, or conclusions
- Critical dates, locations, or responsible parties

Keep the summary concise but comprehensive."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        return self.generate_response(messages, model, max_tokens=512, temperature=0.1)

    def extract_key_information(self, content: str,
                                model: str = "openai/gpt-oss-120b") -> LLMResponse:
        """
        Extract key information from document content.

        Args:
            content: Document content to analyze
            model: Model name to use

        Returns:
            LLMResponse with extracted key information
        """
        system_prompt = """You are an expert at extracting key information from manufacturing documents. Identify and extract the most important data points, specifications, processes, and insights."""

        user_prompt = f"""Extract the key information from the following manufacturing document content:

{content}

Please organize the extracted information into categories such as:
- Manufacturing Processes
- Quality Metrics
- Specifications/Parameters
- Performance Data
- Issues/Problems
- Recommendations
- Dates and Timelines

Present the information in a structured, easy-to-read format."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        return self.generate_response(messages, model, max_tokens=768, temperature=0.1)

    def _check_rate_limit(self):
        """Check and enforce rate limiting."""
        current_time = time.time()

        # Remove timestamps older than 1 minute
        self.request_timestamps = [
            ts for ts in self.request_timestamps
            if current_time - ts < 60
        ]

        # Check if we're at the rate limit
        if len(self.request_timestamps) >= self.max_requests_per_minute:
            sleep_time = 60 - (current_time - self.request_timestamps[0])
            if sleep_time > 0:
                logger.warning(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

        # Add current request timestamp
        self.request_timestamps.append(current_time)

279
+ def get_available_models(self) -> List[str]:
280
+ """
281
+ Get list of available models.
282
+
283
+ Returns:
284
+ List of available model names
285
+ """
286
+ try:
287
+ response = self.session.get(f"{self.base_url}/models")
288
+
289
+ if response.status_code == 200:
290
+ data = response.json()
291
+ models = [model['id'] for model in data.get('data', [])]
292
+ return models
293
+ else:
294
+ logger.error(f"Failed to get models: {response.status_code}")
295
+ return []
296
+
297
+ except Exception as e:
298
+ logger.error(f"Failed to get available models: {e}")
299
+ return []
300
+
301
+ def health_check(self) -> bool:
302
+ """
303
+ Check if the Groq API is accessible.
304
+
305
+ Returns:
306
+ True if healthy, False otherwise
307
+ """
308
+ try:
309
+ response = self.session.get(f"{self.base_url}/models", timeout=10)
310
+ return response.status_code == 200
311
+
312
+ except Exception as e:
313
+ logger.error(f"Groq health check failed: {e}")
314
+ return False
315
+
316
+
317
+
318
+
319
+ class LLMSystem:
320
+ def __init__(self, config: Dict[str, Any]):
321
+ self.config = config
322
+ self.api_key = os.getenv('GROQ_API_KEY') or config.get('groq_api_key')
323
+ self.default_model = config.get('llm_model', 'openai/gpt-oss-120b')
324
+ self.max_retries = config.get('max_retries', 3)
325
+ if not self.api_key:
326
+ raise ValueError("Groq API key is required")
327
+ self.client = GroqClient(self.api_key)
328
+ logger.info(f"LLM system initialized with default model: {self.default_model}")
329
+
330
+ def answer_question(self, question: str, context: str, model: Optional[str] = None) -> str:
331
+ model = model or self.default_model
332
+ for attempt in range(self.max_retries):
333
+ try:
334
+ response = self.client.answer_question(question, context, model)
335
+ if response.success:
336
+ return response.text
337
+ else:
338
+ logger.warning(f"LLM generation failed (attempt {attempt + 1}): {response.error_message}")
339
+ if attempt < self.max_retries - 1:
340
+ time.sleep(2 ** attempt) # Exponential backoff
341
+ except Exception as e:
342
+ logger.warning(f"LLM generation error (attempt {attempt + 1}): {e}")
343
+ if attempt < self.max_retries - 1:
344
+ time.sleep(2 ** attempt)
345
+ # Return fallback response if all attempts failed
346
+ return "I apologize, but I'm unable to generate a response at this time due to technical difficulties. Please try again later."
347
+
348
+ def summarize_content(self, content: str, model: Optional[str] = None) -> str:
349
+ model = model or self.default_model
350
+ for attempt in range(self.max_retries):
351
+ try:
352
+ response = self.client.summarize_document(content, model)
353
+ if response.success:
354
+ return response.text
355
+ else:
356
+ logger.warning(f"Summarization failed (attempt {attempt + 1}): {response.error_message}")
357
+ if attempt < self.max_retries - 1:
358
+ time.sleep(2 ** attempt)
359
+ except Exception as e:
360
+ logger.warning(f"Summarization error (attempt {attempt + 1}): {e}")
361
+ if attempt < self.max_retries - 1:
362
+ time.sleep(2 ** attempt)
363
+ return "Unable to generate summary at this time."
364
+
365
+
366
+ if __name__=="__main__":
367
+ logger.info(f"Groq client init ..")
368
+ ## Test code (for demonstration purposes)
369
+ config = {
370
+ 'groq_api_key': os.getenv('GROQ_API_KEY'),
371
+ 'llm_model': 'openai/gpt-oss-120b',
372
+ 'max_retries': 3
373
+ }
374
+ llm_system = LLMSystem(config)
375
+ question = "What is the capital of France?"
376
+ context = "France is a country in Western Europe."
377
+ answer = llm_system.answer_question(question, context)
378
+ logger.info(f"Answer: {answer}")
src/image_processor.py ADDED
@@ -0,0 +1,538 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Dict, List, Any, Optional, Tuple
4
+ import pytesseract
5
+ from PIL import Image, ImageEnhance, ImageFilter
6
+ import io
7
+ import numpy as np
8
+ from dataclasses import dataclass
9
+ import os
10
+ import sys
11
+
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from src.document_processor import (
15
+ DocumentProcessor,
16
+ ProcessedDocument,
17
+ DocumentType,
18
+ ProcessingStatus,
19
+ DocumentProcessingError,
20
+ ExtractedImage,
21
+ DocumentProcessorFactory
22
+ )
23
+
24
+
25
+ try:
26
+ from logger.custom_logger import CustomLoggerTracker
27
+ custom_log = CustomLoggerTracker()
28
+ logger = custom_log.get_logger("excel_processor")
29
+
30
+ except ImportError:
31
+ # Fallback to standard logging if custom logger not available
32
+ logger = logging.getLogger("excel_processor")
33
+
34
+
35
+ @dataclass
36
+ class OCRResult:
37
+ """Result of OCR processing."""
38
+ text: str
39
+ confidence: float
40
+ word_count: int
41
+ processing_time: float
42
+ preprocessing_applied: List[str]
43
+
44
+
45
+ @dataclass
46
+ class ImageAnalysis:
47
+ """Analysis results for an image."""
48
+ width: int
49
+ height: int
50
+ format: str
51
+ mode: str
52
+ size_bytes: int
53
+ is_grayscale: bool
54
+ average_brightness: float
55
+ contrast_level: float
56
+ estimated_dpi: Optional[int] = None
57
+
58
+
59
+ class ImageProcessor(DocumentProcessor):
60
+ """
61
+ Image processor with OCR capabilities using Tesseract.
62
+
63
+ This processor handles standalone image files and provides OCR text extraction
64
+ with preprocessing to improve accuracy.
65
+ """
66
+
67
+ def __init__(self, config: Dict[str, Any]):
68
+ """
69
+ Initialize the image processor.
70
+
71
+ Args:
72
+ config: Configuration dictionary containing image processing settings
73
+ """
74
+ super().__init__(config)
75
+ self.ocr_engine = config.get('ocr_engine', 'tesseract')
76
+ self.ocr_language = config.get('ocr_language', 'eng')
77
+ self.preprocessing_enabled = config.get('image_preprocessing', True)
78
+ self.min_confidence = config.get('min_ocr_confidence', 30.0)
79
+ self.max_image_size = config.get('max_image_size', (3000, 3000))
80
+ self.enhance_contrast = config.get('enhance_contrast', True)
81
+ self.enhance_sharpness = config.get('enhance_sharpness', True)
82
+
83
+ # Verify Tesseract installation
84
+ self._verify_tesseract()
85
+
86
+ logger.info(f"Image processor initialized with OCR language: {self.ocr_language}, "
87
+ f"preprocessing: {self.preprocessing_enabled}")
88
+
89
+ def _get_supported_extensions(self) -> List[str]:
90
+ """Get supported file extensions for image processor."""
91
+ return ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif']
92
+
93
+ def _verify_tesseract(self) -> None:
94
+ """Verify that Tesseract is properly installed and accessible."""
95
+ try:
96
+ version = pytesseract.get_tesseract_version()
97
+ logger.info(f"Tesseract version: {version}")
98
+ except Exception as e:
99
+ logger.error(f"Tesseract not found or not properly installed: {e}")
100
+ raise DocumentProcessingError(
101
+ "tesseract",
102
+ "InstallationError",
103
+ f"Tesseract OCR engine not found: {e}"
104
+ )
105
+
106
+ def process_document(self, file_path: str) -> ProcessedDocument:
107
+ """
108
+ Process an image file and extract text using OCR.
109
+
110
+ Args:
111
+ file_path: Path to the image file
112
+
113
+ Returns:
114
+ ProcessedDocument with extracted text and metadata
115
+
116
+ Raises:
117
+ DocumentProcessingError: If image processing fails
118
+ """
119
+ try:
120
+ # Validate file first
121
+ self.validate_file(file_path)
122
+
123
+ # Generate document ID
124
+ document_id = self._generate_document_id(file_path)
125
+
126
+ logger.info(f"Processing image document: {file_path}")
127
+
128
+ # Load and analyze image
129
+ image = Image.open(file_path)
130
+ image_analysis = self._analyze_image(image, file_path)
131
+
132
+ # Preprocess image if enabled
133
+ processed_image = image
134
+ preprocessing_steps = []
135
+
136
+ if self.preprocessing_enabled:
137
+ processed_image, preprocessing_steps = self._preprocess_image(image)
138
+
139
+ # Perform OCR
140
+ ocr_result = self._perform_ocr(processed_image)
141
+
142
+ # Create extracted image object
143
+ with open(file_path, 'rb') as f:
144
+ image_content = f.read()
145
+
146
+ extracted_image = ExtractedImage(
147
+ image_id=f"{document_id}_main",
148
+ filename=Path(file_path).name,
149
+ content=image_content,
150
+ format=image_analysis.format,
151
+ width=image_analysis.width,
152
+ height=image_analysis.height,
153
+ ocr_text=ocr_result.text,
154
+ ocr_confidence=ocr_result.confidence,
155
+ extraction_method="tesseract_ocr",
156
+ metadata={
157
+ 'image_analysis': {
158
+ 'mode': image_analysis.mode,
159
+ 'size_bytes': image_analysis.size_bytes,
160
+ 'is_grayscale': image_analysis.is_grayscale,
161
+ 'average_brightness': image_analysis.average_brightness,
162
+ 'contrast_level': image_analysis.contrast_level,
163
+ 'estimated_dpi': image_analysis.estimated_dpi
164
+ },
165
+ 'ocr_result': {
166
+ 'word_count': ocr_result.word_count,
167
+ 'processing_time': ocr_result.processing_time,
168
+ 'preprocessing_applied': ocr_result.preprocessing_applied
169
+ }
170
+ }
171
+ )
172
+
173
+ # Create metadata
174
+ metadata = {
175
+ 'image_analysis': image_analysis.__dict__,
176
+ 'ocr_result': ocr_result.__dict__,
177
+ 'preprocessing_steps': preprocessing_steps,
178
+ 'ocr_language': self.ocr_language,
179
+ 'ocr_engine': self.ocr_engine
180
+ }
181
+
182
+ # Create processed document
183
+ processed_doc = ProcessedDocument(
184
+ document_id=document_id,
185
+ filename=Path(file_path).name,
186
+ file_path=file_path,
187
+ document_type=DocumentType.IMAGE,
188
+ content=ocr_result.text,
189
+ metadata=metadata,
190
+ images=[extracted_image],
191
+ processing_status=ProcessingStatus.COMPLETED
192
+ )
193
+
194
+ logger.info(f"Successfully processed image: {len(ocr_result.text)} characters extracted, "
195
+ f"confidence: {ocr_result.confidence:.1f}%")
196
+
197
+ return processed_doc
198
+
199
+ except Exception as e:
200
+ logger.error(f"Failed to process image {file_path}: {e}")
201
+
202
+ # Create failed document
203
+ document_id = self._generate_document_id(file_path)
204
+ return ProcessedDocument(
205
+ document_id=document_id,
206
+ filename=Path(file_path).name,
207
+ file_path=file_path,
208
+ document_type=DocumentType.IMAGE,
209
+ content="",
210
+ metadata={},
211
+ processing_status=ProcessingStatus.FAILED,
212
+ error_message=str(e)
213
+ )
214
+
215
+ def process_extracted_image(self, extracted_image: ExtractedImage) -> ExtractedImage:
216
+ """
217
+ Process an already extracted image (e.g., from PDF or Excel) with OCR.
218
+
219
+ Args:
220
+ extracted_image: ExtractedImage object to process
221
+
222
+ Returns:
223
+ Updated ExtractedImage with OCR text
224
+ """
225
+ try:
226
+ logger.debug(f"Processing extracted image: {extracted_image.image_id}")
227
+
228
+ # Load image from bytes
229
+ image = Image.open(io.BytesIO(extracted_image.content))
230
+
231
+ # Preprocess image if enabled
232
+ processed_image = image
233
+ preprocessing_steps = []
234
+
235
+ if self.preprocessing_enabled:
236
+ processed_image, preprocessing_steps = self._preprocess_image(image)
237
+
238
+ # Perform OCR
239
+ ocr_result = self._perform_ocr(processed_image)
240
+
241
+ # Update extracted image with OCR results
242
+ extracted_image.ocr_text = ocr_result.text
243
+ extracted_image.ocr_confidence = ocr_result.confidence
244
+
245
+ # Update metadata
246
+ if 'ocr_result' not in extracted_image.metadata:
247
+ extracted_image.metadata['ocr_result'] = {}
248
+
249
+ extracted_image.metadata['ocr_result'].update({
250
+ 'word_count': ocr_result.word_count,
251
+ 'processing_time': ocr_result.processing_time,
252
+ 'preprocessing_applied': preprocessing_steps,
253
+ 'ocr_language': self.ocr_language,
254
+ 'ocr_engine': self.ocr_engine
255
+ })
256
+
257
+ logger.debug(f"OCR completed for {extracted_image.image_id}: "
258
+ f"{len(ocr_result.text)} characters, confidence: {ocr_result.confidence:.1f}%")
259
+
260
+ return extracted_image
261
+
262
+ except Exception as e:
263
+ logger.warning(f"Failed to process extracted image {extracted_image.image_id}: {e}")
264
+
265
+ # Return original image with error info
266
+ extracted_image.metadata['ocr_error'] = str(e)
267
+ return extracted_image
268
+
269
+ def _analyze_image(self, image: Image.Image, file_path: str) -> ImageAnalysis:
270
+ """
271
+ Analyze image properties and characteristics.
272
+
273
+ Args:
274
+ image: PIL Image object
275
+ file_path: Path to the image file
276
+
277
+ Returns:
278
+ ImageAnalysis object with image properties
279
+ """
280
+ try:
281
+ # Basic properties
282
+ width, height = image.size
283
+ format_name = image.format or Path(file_path).suffix[1:].upper()
284
+ mode = image.mode
285
+
286
+ # File size
287
+ size_bytes = Path(file_path).stat().st_size
288
+
289
+ # Convert to grayscale for analysis
290
+ if image.mode != 'L':
291
+ gray_image = image.convert('L')
292
+ else:
293
+ gray_image = image
294
+
295
+ # Calculate brightness and contrast
296
+ np_image = np.array(gray_image)
297
+ average_brightness = np.mean(np_image)
298
+ contrast_level = np.std(np_image)
299
+
300
+ # Check if image is grayscale
301
+ is_grayscale = mode in ['L', '1'] or (mode == 'RGB' and self._is_grayscale_rgb(image))
302
+
303
+ # Estimate DPI if available
304
+ estimated_dpi = None
305
+ if hasattr(image, 'info') and 'dpi' in image.info:
306
+ estimated_dpi = image.info['dpi'][0] if isinstance(image.info['dpi'], tuple) else image.info['dpi']
307
+
308
+ return ImageAnalysis(
309
+ width=width,
310
+ height=height,
311
+ format=format_name,
312
+ mode=mode,
313
+ size_bytes=size_bytes,
314
+ is_grayscale=is_grayscale,
315
+ average_brightness=float(average_brightness),
316
+ contrast_level=float(contrast_level),
317
+ estimated_dpi=estimated_dpi
318
+ )
319
+
320
+ except Exception as e:
321
+ logger.warning(f"Failed to analyze image: {e}")
322
+ # Return basic analysis
323
+ return ImageAnalysis(
324
+ width=image.size[0],
325
+ height=image.size[1],
326
+ format=image.format or "UNKNOWN",
327
+ mode=image.mode,
328
+ size_bytes=0,
329
+ is_grayscale=False,
330
+ average_brightness=128.0,
331
+ contrast_level=50.0
332
+ )
333
+
334
+ def _is_grayscale_rgb(self, image: Image.Image) -> bool:
335
+ """
336
+ Check if an RGB image is actually grayscale.
337
+
338
+ Args:
339
+ image: PIL Image object in RGB mode
340
+
341
+ Returns:
342
+ True if image is grayscale, False otherwise
343
+ """
344
+ try:
345
+ # Sample a few pixels to check if R=G=B
346
+ sample_size = min(100, image.size[0] * image.size[1])
347
+ pixels = list(image.getdata())
348
+
349
+ # Check first 'sample_size' pixels
350
+ for i in range(0, min(sample_size, len(pixels))):
351
+ r, g, b = pixels[i][:3] # Handle RGBA by taking only RGB
352
+ if r != g or g != b:
353
+ return False
354
+
355
+ return True
356
+
357
+ except Exception:
358
+ return False
359
+
360
+ def _preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, List[str]]:
361
+ """
362
+ Preprocess image to improve OCR accuracy.
363
+
364
+ Args:
365
+ image: PIL Image object
366
+
367
+ Returns:
368
+ Tuple of (processed_image, list_of_applied_steps)
369
+ """
370
+ processed_image = image.copy()
371
+ applied_steps = []
372
+
373
+ try:
374
+ # Resize if image is too large
375
+ if image.size[0] > self.max_image_size[0] or image.size[1] > self.max_image_size[1]:
376
+ processed_image.thumbnail(self.max_image_size, Image.Resampling.LANCZOS)
377
+ applied_steps.append("resize")
378
+
379
+ # Convert to grayscale if not already
380
+ if processed_image.mode != 'L':
381
+ processed_image = processed_image.convert('L')
382
+ applied_steps.append("grayscale_conversion")
383
+
384
+ # Enhance contrast if enabled
385
+ if self.enhance_contrast:
386
+ enhancer = ImageEnhance.Contrast(processed_image)
387
+ processed_image = enhancer.enhance(1.5) # Increase contrast by 50%
388
+ applied_steps.append("contrast_enhancement")
389
+
390
+ # Enhance sharpness if enabled
391
+ if self.enhance_sharpness:
392
+ enhancer = ImageEnhance.Sharpness(processed_image)
393
+ processed_image = enhancer.enhance(1.2) # Increase sharpness by 20%
394
+ applied_steps.append("sharpness_enhancement")
395
+
396
+ # Apply noise reduction
397
+ processed_image = processed_image.filter(ImageFilter.MedianFilter(size=3))
398
+ applied_steps.append("noise_reduction")
399
+
400
+ except Exception as e:
401
+ logger.warning(f"Error during image preprocessing: {e}")
402
+ # Return original image if preprocessing fails
403
+ return image, ["preprocessing_failed"]
404
+
405
+ return processed_image, applied_steps
406
+
407
+ def _perform_ocr(self, image: Image.Image) -> OCRResult:
408
+ """
409
+ Perform OCR on the processed image.
410
+
411
+ Args:
412
+ image: PIL Image object
413
+
414
+ Returns:
415
+ OCRResult with extracted text and metadata
416
+ """
417
+ import time
418
+
419
+ start_time = time.time()
420
+
421
+ try:
422
+ # Configure Tesseract
423
+ custom_config = r'--oem 3 --psm 6' # Use LSTM OCR Engine Mode with uniform text block
424
+
425
+ # Get text with confidence scores
426
+ data = pytesseract.image_to_data(
427
+ image,
428
+ lang=self.ocr_language,
429
+ config=custom_config,
430
+ output_type=pytesseract.Output.DICT
431
+ )
432
+
433
+ # Extract text and calculate average confidence
434
+ words = []
435
+ confidences = []
436
+
437
+ for i, conf in enumerate(data['conf']):
438
+ if int(conf) > 0: # Only include words with confidence > 0
439
+ word = data['text'][i].strip()
440
+ if word: # Only include non-empty words
441
+ words.append(word)
442
+ confidences.append(int(conf))
443
+
444
+ # Combine words into text
445
+ extracted_text = ' '.join(words)
446
+
447
+ # Calculate average confidence
448
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
449
+
450
+ # Calculate processing time
451
+ processing_time = time.time() - start_time
452
+
453
+ # Clean up text
454
+ extracted_text = self._clean_ocr_text(extracted_text)
455
+
456
+ return OCRResult(
457
+ text=extracted_text,
458
+ confidence=avg_confidence,
459
+ word_count=len(words),
460
+ processing_time=processing_time,
461
+ preprocessing_applied=[] # Will be filled by caller
462
+ )
463
+
464
+ except Exception as e:
465
+ logger.error(f"OCR processing failed: {e}")
466
+ processing_time = time.time() - start_time
467
+
468
+ return OCRResult(
469
+ text="",
470
+ confidence=0.0,
471
+ word_count=0,
472
+ processing_time=processing_time,
473
+ preprocessing_applied=[]
474
+ )
475
+
476
+ def _clean_ocr_text(self, text: str) -> str:
477
+ """
478
+ Clean and normalize OCR extracted text.
479
+
480
+ Args:
481
+ text: Raw OCR text
482
+
483
+ Returns:
484
+ Cleaned text
485
+ """
486
+ if not text:
487
+ return ""
488
+
489
+ # Remove excessive whitespace
490
+ import re
491
+ text = re.sub(r'\s+', ' ', text)
492
+
493
+ # Remove common OCR artifacts
494
+ text = text.replace('|', 'I') # Common misrecognition
495
+ # NOTE: do not blanket-swap '0' for 'O'; that would corrupt numeric data (measurements, part numbers)
496
+
497
+ # Strip leading/trailing whitespace
498
+ text = text.strip()
499
+
500
+ return text
501
+
502
+ def batch_process_images(self, image_list: List[ExtractedImage]) -> List[ExtractedImage]:
503
+ """
504
+ Process multiple extracted images in batch.
505
+
506
+ Args:
507
+ image_list: List of ExtractedImage objects
508
+
509
+ Returns:
510
+ List of processed ExtractedImage objects with OCR text
511
+ """
512
+ processed_images = []
513
+
514
+ logger.info(f"Starting batch OCR processing for {len(image_list)} images")
515
+
516
+ for i, extracted_image in enumerate(image_list):
517
+ try:
518
+ logger.debug(f"Processing image {i+1}/{len(image_list)}: {extracted_image.image_id}")
519
+ processed_image = self.process_extracted_image(extracted_image)
520
+ processed_images.append(processed_image)
521
+
522
+ except Exception as e:
523
+ logger.warning(f"Failed to process image {extracted_image.image_id}: {e}")
524
+ # Add original image with error info
525
+ extracted_image.metadata['batch_processing_error'] = str(e)
526
+ processed_images.append(extracted_image)
527
+
528
+ logger.info(f"Completed batch OCR processing: {len(processed_images)} images processed")
529
+ return processed_images
530
+
531
+
532
+ # Register the Image processor
533
+ DocumentProcessorFactory.register_processor(DocumentType.IMAGE, ImageProcessor)
534
+
535
+
536
+
537
+ if __name__=="__main__":
538
+ logger.info(f"Image processor init ..")
src/ingestion_pipeline.py ADDED
@@ -0,0 +1,505 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Optional, Tuple
5
+ from dataclasses import dataclass
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ import hashlib
8
+ import os
9
+ import sys
10
+
11
+
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from src.document_processor import (
15
+ DocumentProcessor, DocumentProcessorFactory, ProcessedDocument,
16
+ DocumentChunk, ProcessingStatus, DocumentType
17
+ )
18
+ from src.embedding_system import EmbeddingSystem
19
+ from src.vector_store import QdrantVectorStore
20
+ from src.metadata_manager import MetadataManager, DocumentMetadata
21
+ from src.image_processor import ImageProcessor
22
+
23
+
24
+ try:
25
+ from logger.custom_logger import CustomLoggerTracker
26
+ custom_log = CustomLoggerTracker()
27
+ logger = custom_log.get_logger("ingestion_pipeline")
28
+
29
+ except ImportError:
30
+ # Fallback to standard logging if custom logger not available
31
+ logger = logging.getLogger("ingestion_pipeline")
32
+
33
+
34
+ @dataclass
35
+ class IngestionResult:
36
+ """Result of document ingestion."""
37
+ document_id: str
38
+ filename: str
39
+ success: bool
40
+ processing_time: float
41
+ chunks_created: int
42
+ chunks_indexed: int
43
+ error_message: Optional[str] = None
44
+ warnings: List[str] = None
45
+
46
+ def __post_init__(self):
47
+ if self.warnings is None:
48
+ self.warnings = []
49
+
50
+
51
+ @dataclass
52
+ class IngestionStats:
53
+ """Statistics for batch ingestion."""
54
+ total_documents: int
55
+ successful_documents: int
56
+ failed_documents: int
57
+ total_chunks: int
58
+ total_processing_time: float
59
+ average_processing_time: float
60
+ documents_by_type: Dict[str, int]
61
+ errors: List[str]
62
+
63
+
64
+
65
+ def jina_embeddings(text: str) -> List[float]:
66
+ JINA_API_KEY= "jina_a75b55a8a9524bb697ea016b164211ebF5IduSgA0Ku8lmI0pS9fnXoZ83Su"
67
+ import requests
68
+
69
+ headers = {
70
+ 'Content-Type': 'application/json',
71
+ 'Authorization': f'Bearer {JINA_API_KEY}'}
72
+
73
+ data = {
74
+ "model": "jina-embeddings-v3",
75
+ "task": "retrieval.passage",
76
+ "input": text}
77
+
78
+ response = requests.post('https://api.jina.ai/v1/embeddings', headers=headers, json=data, timeout=30)
79
+ return response.json()['data'][0]['embedding']
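Note: the Jina embeddings endpoint also accepts a list of strings as `input`, so the per-chunk loop used in `ingest_document` below could be collapsed into far fewer HTTP calls. A hedged sketch of that batching (the timeout value and the assumption that `JINA_API_KEY` is set in the environment are illustrative):

import os
import requests

def jina_embeddings_batch(texts):
    """Embed many passages in one request instead of one request per chunk."""
    response = requests.post(
        'https://api.jina.ai/v1/embeddings',
        headers={'Authorization': f"Bearer {os.environ['JINA_API_KEY']}"},
        json={'model': 'jina-embeddings-v3', 'task': 'retrieval.passage', 'input': texts},
        timeout=60,
    )
    response.raise_for_status()
    # The API returns one entry per input, in the same order
    return [item['embedding'] for item in response.json()['data']]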
80
+
81
+
82
+ class DocumentIngestionPipeline:
83
+ def __init__(self, config: Dict[str, Any]):
84
+ self.config = config
85
+ # Initialize components
86
+ self.embedding_system = EmbeddingSystem(config)
87
+ self.vector_store = QdrantVectorStore(config)
88
+ self.metadata_manager = MetadataManager(config)
89
+ # Initialize components with correct vector dimensions
90
+ self.vector_size = config.get('vector_size', 1024) # Match Jina's dimension
91
+ self.config['vector_size'] = self.vector_size # Update config for other components
92
+
93
+ # Initialize image processor for OCR
94
+ self.image_processor = ImageProcessor(config)
95
+
96
+ # Pipeline settings
97
+ self.chunk_size = config.get('chunk_size', 512)
98
+ self.chunk_overlap = config.get('chunk_overlap', 50)
99
+ self.batch_size = config.get('embedding_batch_size', 32)
100
+ self.max_workers = config.get('max_workers', 4)
101
+ self.enable_ocr = config.get('image_processing', True)
102
+
103
+ logger.info(f"Document ingestion pipeline initialized")
104
+
105
+ def ingest_document(self, file_path: str, document_id: Optional[str] = None) -> IngestionResult:
106
+ """
107
+ Ingest a single document through the complete pipeline.
108
+
109
+ Args:
110
+ file_path: Path to the document file
111
+ document_id: Optional custom document ID
112
+
113
+ Returns:
114
+ IngestionResult with processing details
115
+ """
116
+ start_time = time.time()
117
+ file_path_obj = Path(file_path)
118
+ filename = file_path_obj.name
119
+
120
+ try:
121
+ logger.info(f"Starting ingestion of document: {filename}")
122
+
123
+ # Generate document ID if not provided
124
+ if not document_id:
125
+ document_id = self._generate_document_id(file_path)
126
+
127
+ # Check if document already exists
128
+ existing_metadata = self.metadata_manager.get_document_metadata(document_id)
129
+ if existing_metadata and existing_metadata.processing_status == ProcessingStatus.COMPLETED:
130
+ logger.info(f"Document {filename} already processed, skipping")
131
+ return IngestionResult(
132
+ document_id=document_id,
133
+ filename=filename,
134
+ success=True,
135
+ processing_time=0.0,
136
+ chunks_created=existing_metadata.total_chunks,
137
+ chunks_indexed=existing_metadata.total_chunks,
138
+ warnings=["Document already processed"]
139
+ )
140
+
141
+ # Step 1: Process document
142
+ processed_doc = self._process_document(file_path)
143
+ if processed_doc.processing_status == ProcessingStatus.FAILED:
144
+ return IngestionResult(
145
+ document_id=document_id,
146
+ filename=filename,
147
+ success=False,
148
+ processing_time=time.time() - start_time,
149
+ chunks_created=0,
150
+ chunks_indexed=0,
151
+ error_message=processed_doc.error_message
152
+ )
153
+
154
+ # Step 2: Process images with OCR if enabled
155
+ if self.enable_ocr and processed_doc.images:
156
+ processed_doc.images = self.image_processor.batch_process_images(processed_doc.images)
157
+
158
+ # Step 3: Create document chunks
159
+ processor = DocumentProcessorFactory.create_processor(file_path, self.config)
160
+ chunks = processor.extract_chunks(processed_doc, self.chunk_size, self.chunk_overlap)
161
+
162
+ if not chunks:
163
+ logger.warning(f"No chunks created for document: {filename}")
164
+ return IngestionResult(
165
+ document_id=document_id,
166
+ filename=filename,
167
+ success=False,
168
+ processing_time=time.time() - start_time,
169
+ chunks_created=0,
170
+ chunks_indexed=0,
171
+ error_message="No content chunks could be created"
172
+ )
173
+
174
+
175
+
176
+
177
+ # Step 4: Generate embeddings
178
+ chunk_texts = [chunk.content for chunk in chunks]
179
+ logger.info(chunk_texts[:2])
180
+ # embeddings = self.embedding_system.generate_embeddings(chunk_texts)
181
+ embeddings = [jina_embeddings(text) for text in chunk_texts]
182
+
183
+
184
+ if not embeddings or len(embeddings) != len(chunks):
185
+ logger.error(f"Embedding generation failed for document: {filename}")
186
+ return IngestionResult(
187
+ document_id=document_id,
188
+ filename=filename,
189
+ success=False,
190
+ processing_time=time.time() - start_time,
191
+ chunks_created=len(chunks),
192
+ chunks_indexed=0,
193
+ error_message="Failed to generate embeddings"
194
+ )
195
+
196
+ # Attach embeddings to chunks
197
+ for chunk, embedding in zip(chunks, embeddings):
198
+ chunk.embedding = embedding
199
+
200
+
201
+
202
+ # Step 5: Store in vector database
203
+ vector_success = self.vector_store.add_documents(chunks)
204
+ if not vector_success:
205
+ logger.error(f"Failed to store vectors for document: {filename}")
206
+ return IngestionResult(
207
+ document_id=document_id,
208
+ filename=filename,
209
+ success=False,
210
+ processing_time=time.time() - start_time,
211
+ chunks_created=len(chunks),
212
+ chunks_indexed=0,
213
+ error_message="Failed to store document vectors"
214
+ )
215
+
216
+ # Step 6: Store metadata
217
+ processing_time = time.time() - start_time
218
+ metadata = DocumentMetadata(
219
+ document_id=document_id,
220
+ filename=filename,
221
+ file_path=file_path,
222
+ file_type=processed_doc.document_type.value,
223
+ upload_timestamp=processed_doc.processing_timestamp,
224
+ processing_status=ProcessingStatus.COMPLETED,
225
+ total_chunks=len(chunks),
226
+ file_size=processed_doc.file_size,
227
+ checksum=processed_doc.checksum,
228
+ processing_time=processing_time,
229
+ metadata_json=self._serialize_metadata(processed_doc.metadata)
230
+ )
231
+
232
+ metadata_success = self.metadata_manager.store_document_metadata(document_id, metadata)
233
+ if not metadata_success:
234
+ logger.warning(f"Failed to store metadata for document: {filename}")
235
+
236
+ logger.info(f"Successfully ingested document {filename}: {len(chunks)} chunks in {processing_time:.2f}s")
237
+
238
+ return IngestionResult(
239
+ document_id=document_id,
240
+ filename=filename,
241
+ success=True,
242
+ processing_time=processing_time,
243
+ chunks_created=len(chunks),
244
+ chunks_indexed=len(chunks)
245
+ )
246
+
247
+ except Exception as e:
248
+ error_msg = f"Ingestion failed for {filename}: {str(e)}"
249
+ logger.error(error_msg)
250
+
251
+ # Update metadata with error status
252
+ if document_id:
253
+ self.metadata_manager.update_document_status(
254
+ document_id,
255
+ ProcessingStatus.FAILED,
256
+ error_msg,
257
+ time.time() - start_time
258
+ )
259
+
260
+ return IngestionResult(
261
+ document_id=document_id or "unknown",
262
+ filename=filename,
263
+ success=False,
264
+ processing_time=time.time() - start_time,
265
+ chunks_created=0,
266
+ chunks_indexed=0,
267
+ error_message=error_msg
268
+ )
269
+
270
+ def ingest_batch(self, file_paths: List[str], max_workers: Optional[int] = None) -> IngestionStats:
271
+ """
272
+ Ingest multiple documents in parallel.
273
+
274
+ Args:
275
+ file_paths: List of file paths to process
276
+ max_workers: Maximum number of worker threads
277
+
278
+ Returns:
279
+ IngestionStats with batch processing results
280
+ """
281
+ start_time = time.time()
282
+ max_workers = max_workers or self.max_workers
283
+
284
+ logger.info(f"Starting batch ingestion of {len(file_paths)} documents with {max_workers} workers")
285
+
286
+ results = []
287
+ errors = []
288
+ documents_by_type = {}
289
+
290
+ # Process documents in parallel
291
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
292
+ # Submit all tasks
293
+ future_to_path = {
294
+ executor.submit(self.ingest_document, file_path): file_path
295
+ for file_path in file_paths
296
+ }
297
+
298
+ # Collect results
299
+ for future in as_completed(future_to_path):
300
+ file_path = future_to_path[future]
301
+ try:
302
+ result = future.result()
303
+ results.append(result)
304
+
305
+ # Track document types
306
+ file_ext = Path(file_path).suffix.lower()
307
+ documents_by_type[file_ext] = documents_by_type.get(file_ext, 0) + 1
308
+
309
+ if not result.success:
310
+ errors.append(f"{result.filename}: {result.error_message}")
311
+
312
+ except Exception as e:
313
+ error_msg = f"Failed to process {file_path}: {str(e)}"
314
+ errors.append(error_msg)
315
+ logger.error(error_msg)
316
+
317
+ # Calculate statistics
318
+ successful_results = [r for r in results if r.success]
319
+ failed_results = [r for r in results if not r.success]
320
+
321
+ total_processing_time = time.time() - start_time
322
+ total_chunks = sum(r.chunks_indexed for r in successful_results)
323
+ avg_processing_time = (
324
+ sum(r.processing_time for r in results) / len(results)
325
+ if results else 0.0
326
+ )
327
+
328
+ stats = IngestionStats(
329
+ total_documents=len(file_paths),
330
+ successful_documents=len(successful_results),
331
+ failed_documents=len(failed_results),
332
+ total_chunks=total_chunks,
333
+ total_processing_time=total_processing_time,
334
+ average_processing_time=avg_processing_time,
335
+ documents_by_type=documents_by_type,
336
+ errors=errors
337
+ )
338
+
339
+ logger.info(f"Batch ingestion completed: {stats.successful_documents}/{stats.total_documents} "
340
+ f"documents processed successfully in {total_processing_time:.2f}s")
341
+
342
+ return stats
343
+
344
+ def reprocess_document(self, document_id: str) -> IngestionResult:
345
+ """
346
+ Reprocess an existing document.
347
+
348
+ Args:
349
+ document_id: ID of the document to reprocess
350
+
351
+ Returns:
352
+ IngestionResult with reprocessing details
353
+ """
354
+ # Get existing metadata
355
+ metadata = self.metadata_manager.get_document_metadata(document_id)
356
+ if not metadata:
357
+ return IngestionResult(
358
+ document_id=document_id,
359
+ filename="unknown",
360
+ success=False,
361
+ processing_time=0.0,
362
+ chunks_created=0,
363
+ chunks_indexed=0,
364
+ error_message="Document not found in metadata"
365
+ )
366
+
367
+ # Delete existing vectors
368
+ self.vector_store.delete_document(document_id)
369
+
370
+ # Reprocess the document
371
+ return self.ingest_document(metadata.file_path, document_id)
372
+
373
+ def delete_document(self, document_id: str) -> bool:
374
+ """
375
+ Delete a document and all associated data.
376
+
377
+ Args:
378
+ document_id: ID of the document to delete
379
+
380
+ Returns:
381
+ True if successful, False otherwise
382
+ """
383
+ try:
384
+ # Delete from vector store
385
+ vector_success = self.vector_store.delete_document(document_id)
386
+
387
+ # Delete from metadata
388
+ metadata_success = self.metadata_manager.delete_document(document_id)
389
+
390
+ success = vector_success and metadata_success
391
+ if success:
392
+ logger.info(f"Successfully deleted document: {document_id}")
393
+ else:
394
+ logger.warning(f"Partial deletion of document: {document_id}")
395
+
396
+ return success
397
+
398
+ except Exception as e:
399
+ logger.error(f"Failed to delete document {document_id}: {e}")
400
+ return False
401
+
402
+ def _process_document(self, file_path: str) -> ProcessedDocument:
403
+ try:
404
+ processor = DocumentProcessorFactory.create_processor(file_path, self.config)
405
+ return processor.process_document(file_path)
406
+
407
+ except Exception as e:
408
+ logger.error(f"Document processing failed for {file_path}: {e}")
409
+
410
+ # Return failed document
411
+ document_id = self._generate_document_id(file_path)
412
+ return ProcessedDocument(
413
+ document_id=document_id,
414
+ filename=Path(file_path).name,
415
+ file_path=file_path,
416
+ document_type=DocumentType.UNKNOWN,
417
+ content="",
418
+ metadata={},
419
+ processing_status=ProcessingStatus.FAILED,
420
+ error_message=str(e)
421
+ )
422
+
423
+ def _generate_document_id(self, file_path: str) -> str:
424
+ # Use file path and modification time for uniqueness
425
+ file_path_obj = Path(file_path)
426
+ if file_path_obj.exists():
427
+ mtime = file_path_obj.stat().st_mtime
428
+ content = f"{file_path}_{mtime}"
429
+ else:
430
+ content = f"{file_path}_{time.time()}"
431
+
432
+ return hashlib.md5(content.encode()).hexdigest()
433
+
434
+ def _serialize_metadata(self, metadata: Dict[str, Any]) -> str:
435
+ try:
436
+ import json
437
+ return json.dumps(metadata, default=str, ensure_ascii=False)
438
+ except Exception as e:
439
+ logger.warning(f"Failed to serialize metadata: {e}")
440
+ return "{}"
441
+
442
+
443
+
444
+ def get_pipeline_stats(self) -> Dict[str, Any]:
445
+ """
446
+ Get statistics about the ingestion pipeline.
447
+
448
+ Returns:
449
+ Dictionary with pipeline statistics
450
+ """
451
+ try:
452
+ # Get component statistics
453
+ vector_stats = self.vector_store.get_collection_info()
454
+ metadata_stats = self.metadata_manager.get_statistics()
455
+ embedding_stats = self.embedding_system.get_cache_stats()
456
+
457
+ return {
458
+ "vector_store": vector_stats.__dict__ if vector_stats else {},
459
+ "metadata_manager": metadata_stats,
460
+ "embedding_system": embedding_stats,
461
+ "pipeline_config": {
462
+ "chunk_size": self.chunk_size,
463
+ "chunk_overlap": self.chunk_overlap,
464
+ "batch_size": self.batch_size,
465
+ "max_workers": self.max_workers,
466
+ "enable_ocr": self.enable_ocr
467
+ }
468
+ }
469
+
470
+ except Exception as e:
471
+ logger.error(f"Failed to get pipeline stats: {e}")
472
+ return {"error": str(e)}
473
+
474
+ def health_check(self) -> Dict[str, bool]:
475
+ """
476
+ Check health of all pipeline components.
477
+
478
+ Returns:
479
+ Dictionary with health status of each component
480
+ """
481
+ return {
482
+ "vector_store": self.vector_store.health_check(),
483
+ "metadata_manager": True, # SQLite is always available if file system works
484
+ "embedding_system": True # Will be checked during actual usage
485
+ }
486
+
487
+
488
+
489
+ if __name__=="__main__":
490
+ logger.info(f"Ingestion Pipe init ..")
491
+
492
+ ## Example usage
493
+ import yaml
494
+ with open("src/config.yaml", 'r') as f:
495
+ config = yaml.safe_load(f)
496
+ pipeline = DocumentIngestionPipeline(config)
497
+ stats = pipeline.get_pipeline_stats()
498
+ logger.info(f"Pipeline stats: {stats}")
499
+ # Example single document ingestion
500
+ result = pipeline.ingest_document("data/documents/3.수불확인등록.xlsx")
501
+ logger.info(f"Ingestion result: {result}")
502
+ # Example batch ingestion
503
+ # batch_result = pipeline.ingest_batch(["sample_data/sample.pdf", "sample_data/sample.docx"])
504
+ # logger.info(f"Batch ingestion stats: {batch_result}")
505
+
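Note: the deduplication behavior of `_generate_document_id` above is easy to miss: the ID hashes the file path plus its mtime, so re-ingesting an unchanged file produces the same ID and the "already processed" check at the top of `ingest_document` short-circuits. A minimal standalone sketch of that property:

import hashlib
import time
from pathlib import Path

def doc_id(path: str) -> str:
    """Stable ID from path + mtime; an unchanged file maps to the same ID."""
    p = Path(path)
    stamp = p.stat().st_mtime if p.exists() else time.time()
    return hashlib.md5(f"{path}_{stamp}".encode()).hexdigest()

# Same file, unchanged between calls -> identical ID, so reprocessing is skipped.
assert doc_id(__file__) == doc_id(__file__)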
src/metadata_manager.py ADDED
@@ -0,0 +1,518 @@
1
+ import logging
2
+ import sqlite3
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Optional
5
+ from dataclasses import dataclass, asdict
6
+ from datetime import datetime
7
+ import json
8
+ import os
9
+ import sys
10
+
11
+
12
+
13
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
+
15
+ from src.document_processor import ProcessingStatus, DocumentType
16
+
17
+
18
+ try:
19
+ from logger.custom_logger import CustomLoggerTracker
20
+ custom_log = CustomLoggerTracker()
21
+ logger = custom_log.get_logger("meta_manager")
22
+
23
+ except ImportError:
24
+ # Fallback to standard logging if custom logger not available
25
+ logger = logging.getLogger("meta_manager")
26
+
27
+
28
+ @dataclass
29
+ class DocumentMetadata:
30
+ """Metadata for a processed document."""
31
+ document_id: str
32
+ filename: str
33
+ file_path: str
34
+ file_type: str
35
+ upload_timestamp: datetime
36
+ processing_status: ProcessingStatus
37
+ total_chunks: int
38
+ file_size: int
39
+ checksum: str
40
+ error_message: Optional[str] = None
41
+ processing_time: Optional[float] = None
42
+ metadata_json: Optional[str] = None # Additional metadata as JSON
43
+
44
+
45
+ @dataclass
46
+ class CitationInfo:
47
+ """Citation information for a document chunk."""
48
+ chunk_id: str
49
+ document_id: str
50
+ source_document: str
51
+ location_reference: str
52
+ extraction_method: str
53
+ confidence_level: float
54
+ page_number: Optional[int] = None
55
+ worksheet_name: Optional[str] = None
56
+ cell_range: Optional[str] = None
57
+ section_title: Optional[str] = None
58
+
59
+
60
+ class MetadataManager:
61
+ """
62
+ SQLite-based metadata manager for document tracking and citation management.
63
+
64
+ This manager provides persistent storage for document metadata, processing status,
65
+ and citation information with efficient querying capabilities.
66
+ """
67
+
68
+ def __init__(self, config: Dict[str, Any]):
69
+ """
70
+ Initialize the metadata manager.
71
+
72
+ Args:
73
+ config: Configuration dictionary containing database settings
74
+ """
75
+ self.config = config
76
+ self.db_path = config.get('metadata_db_path', './data/metadata.db')
77
+
78
+ # Ensure database directory exists
79
+ Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
80
+
81
+ # Initialize database
82
+ self._init_database()
83
+
84
+ logger.info(f"Metadata manager initialized with database: {self.db_path}")
85
+
86
+ def _init_database(self):
87
+ """Initialize the SQLite database with required tables."""
88
+ try:
89
+ with sqlite3.connect(self.db_path) as conn:
90
+ cursor = conn.cursor()
91
+
92
+ # Create documents table
93
+ cursor.execute('''
94
+ CREATE TABLE IF NOT EXISTS documents (
95
+ document_id TEXT PRIMARY KEY,
96
+ filename TEXT NOT NULL,
97
+ file_path TEXT NOT NULL,
98
+ file_type TEXT NOT NULL,
99
+ upload_timestamp TEXT NOT NULL,
100
+ processing_status TEXT NOT NULL,
101
+ total_chunks INTEGER DEFAULT 0,
102
+ file_size INTEGER DEFAULT 0,
103
+ checksum TEXT,
104
+ error_message TEXT,
105
+ processing_time REAL,
106
+ metadata_json TEXT,
107
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
108
+ updated_at TEXT DEFAULT CURRENT_TIMESTAMP
109
+ )
110
+ ''')
111
+
112
+ # Create citations table
113
+ cursor.execute('''
114
+ CREATE TABLE IF NOT EXISTS citations (
115
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
116
+ chunk_id TEXT NOT NULL,
117
+ document_id TEXT NOT NULL,
118
+ source_document TEXT NOT NULL,
119
+ location_reference TEXT NOT NULL,
120
+ extraction_method TEXT NOT NULL,
121
+ confidence_level REAL NOT NULL,
122
+ page_number INTEGER,
123
+ worksheet_name TEXT,
124
+ cell_range TEXT,
125
+ section_title TEXT,
126
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
127
+ FOREIGN KEY (document_id) REFERENCES documents (document_id)
128
+ )
129
+ ''')
130
+
131
+ # Create indexes for efficient querying
132
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_documents_status ON documents (processing_status)')
133
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_documents_type ON documents (file_type)')
134
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_citations_document ON citations (document_id)')
135
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_citations_chunk ON citations (chunk_id)')
136
+
137
+ conn.commit()
138
+ logger.debug("Database tables initialized successfully")
139
+
140
+ except Exception as e:
141
+ logger.error(f"Failed to initialize database: {e}")
142
+ raise
143
+
144
+ def store_document_metadata(self, doc_id: str, metadata: DocumentMetadata) -> bool:
145
+ """
146
+ Store document metadata in the database.
147
+
148
+ Args:
149
+ doc_id: Document ID
150
+ metadata: DocumentMetadata object
151
+
152
+ Returns:
153
+ True if successful, False otherwise
154
+ """
155
+ try:
156
+ with sqlite3.connect(self.db_path) as conn:
157
+ cursor = conn.cursor()
158
+
159
+ # Convert datetime to ISO string
160
+ upload_timestamp = metadata.upload_timestamp.isoformat()
161
+
162
+ cursor.execute('''
163
+ INSERT OR REPLACE INTO documents (
164
+ document_id, filename, file_path, file_type, upload_timestamp,
165
+ processing_status, total_chunks, file_size, checksum,
166
+ error_message, processing_time, metadata_json, updated_at
167
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
168
+ ''', (
169
+ doc_id,
170
+ metadata.filename,
171
+ metadata.file_path,
172
+ metadata.file_type,
173
+ upload_timestamp,
174
+ metadata.processing_status.value,
175
+ metadata.total_chunks,
176
+ metadata.file_size,
177
+ metadata.checksum,
178
+ metadata.error_message,
179
+ metadata.processing_time,
180
+ metadata.metadata_json,
181
+ datetime.now().isoformat()
182
+ ))
183
+
184
+ conn.commit()
185
+ logger.debug(f"Stored metadata for document: {doc_id}")
186
+ return True
187
+
188
+ except Exception as e:
189
+ logger.error(f"Failed to store document metadata: {e}")
190
+ return False
191
+
192
+ def get_document_metadata(self, doc_id: str) -> Optional[DocumentMetadata]:
193
+ """
194
+ Retrieve document metadata by ID.
195
+
196
+ Args:
197
+ doc_id: Document ID
198
+
199
+ Returns:
200
+ DocumentMetadata object or None if not found
201
+ """
202
+ try:
203
+ with sqlite3.connect(self.db_path) as conn:
204
+ cursor = conn.cursor()
205
+
206
+ cursor.execute('''
207
+ SELECT document_id, filename, file_path, file_type, upload_timestamp,
208
+ processing_status, total_chunks, file_size, checksum,
209
+ error_message, processing_time, metadata_json
210
+ FROM documents WHERE document_id = ?
211
+ ''', (doc_id,))
212
+
213
+ row = cursor.fetchone()
214
+ if row:
215
+ return DocumentMetadata(
216
+ document_id=row[0],
217
+ filename=row[1],
218
+ file_path=row[2],
219
+ file_type=row[3],
220
+ upload_timestamp=datetime.fromisoformat(row[4]),
221
+ processing_status=ProcessingStatus(row[5]),
222
+ total_chunks=row[6],
223
+ file_size=row[7],
224
+ checksum=row[8],
225
+ error_message=row[9],
226
+ processing_time=row[10],
227
+ metadata_json=row[11]
228
+ )
229
+
230
+ return None
231
+
232
+ except Exception as e:
233
+ logger.error(f"Failed to get document metadata: {e}")
234
+ return None
235
+
236
+ def update_document_status(self, doc_id: str, status: ProcessingStatus,
237
+ error_message: Optional[str] = None,
238
+ processing_time: Optional[float] = None) -> bool:
239
+ """
240
+ Update document processing status.
241
+
242
+ Args:
243
+ doc_id: Document ID
244
+ status: New processing status
245
+ error_message: Optional error message
246
+ processing_time: Optional processing time
247
+
248
+ Returns:
249
+ True if successful, False otherwise
250
+ """
251
+ try:
252
+ with sqlite3.connect(self.db_path) as conn:
253
+ cursor = conn.cursor()
254
+
255
+ cursor.execute('''
256
+ UPDATE documents
257
+ SET processing_status = ?, error_message = ?, processing_time = ?, updated_at = ?
258
+ WHERE document_id = ?
259
+ ''', (
260
+ status.value,
261
+ error_message,
262
+ processing_time,
263
+ datetime.now().isoformat(),
264
+ doc_id
265
+ ))
266
+
267
+ conn.commit()
268
+ logger.debug(f"Updated status for document {doc_id}: {status.value}")
269
+ return True
270
+
271
+ except Exception as e:
272
+ logger.error(f"Failed to update document status: {e}")
273
+ return False
274
+
275
+ def store_citation_info(self, citation: CitationInfo) -> bool:
276
+ """
277
+ Store citation information.
278
+
279
+ Args:
280
+ citation: CitationInfo object
281
+
282
+ Returns:
283
+ True if successful, False otherwise
284
+ """
285
+ try:
286
+ with sqlite3.connect(self.db_path) as conn:
287
+ cursor = conn.cursor()
288
+
289
+ cursor.execute('''
290
+ INSERT OR REPLACE INTO citations (
291
+ chunk_id, document_id, source_document, location_reference,
292
+ extraction_method, confidence_level, page_number,
293
+ worksheet_name, cell_range, section_title
294
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
295
+ ''', (
296
+ citation.chunk_id,
297
+ citation.document_id,
298
+ citation.source_document,
299
+ citation.location_reference,
300
+ citation.extraction_method,
301
+ citation.confidence_level,
302
+ citation.page_number,
303
+ citation.worksheet_name,
304
+ citation.cell_range,
305
+ citation.section_title
306
+ ))
307
+
308
+ conn.commit()
309
+ logger.debug(f"Stored citation for chunk: {citation.chunk_id}")
310
+ return True
311
+
312
+ except Exception as e:
313
+ logger.error(f"Failed to store citation info: {e}")
314
+ return False
315
+
316
+ def get_citation_info(self, chunk_id: str) -> Optional[CitationInfo]:
317
+ """
318
+ Retrieve citation information by chunk ID.
319
+
320
+ Args:
321
+ chunk_id: Chunk ID
322
+
323
+ Returns:
324
+ CitationInfo object or None if not found
325
+ """
326
+ try:
327
+ with sqlite3.connect(self.db_path) as conn:
328
+ cursor = conn.cursor()
329
+
330
+ cursor.execute('''
331
+ SELECT chunk_id, document_id, source_document, location_reference,
332
+ extraction_method, confidence_level, page_number,
333
+ worksheet_name, cell_range, section_title
334
+ FROM citations WHERE chunk_id = ?
335
+ ''', (chunk_id,))
336
+
337
+ row = cursor.fetchone()
338
+ if row:
339
+ return CitationInfo(
340
+ chunk_id=row[0],
341
+ document_id=row[1],
342
+ source_document=row[2],
343
+ location_reference=row[3],
344
+ extraction_method=row[4],
345
+ confidence_level=row[5],
346
+ page_number=row[6],
347
+ worksheet_name=row[7],
348
+ cell_range=row[8],
349
+ section_title=row[9]
350
+ )
351
+
352
+ return None
353
+
354
+ except Exception as e:
355
+ logger.error(f"Failed to get citation info: {e}")
356
+ return None
357
+
358
+ def list_documents(self, status: Optional[ProcessingStatus] = None,
359
+ file_type: Optional[str] = None,
360
+ limit: int = 100) -> List[DocumentMetadata]:
361
+
362
+ try:
363
+ with sqlite3.connect(self.db_path) as conn:
364
+ cursor = conn.cursor()
365
+
366
+ query = '''
367
+ SELECT document_id, filename, file_path, file_type, upload_timestamp,
368
+ processing_status, total_chunks, file_size, checksum,
369
+ error_message, processing_time, metadata_json
370
+ FROM documents
371
+ '''
372
+
373
+ conditions = []
374
+ params = []
375
+
376
+ if status:
377
+ conditions.append('processing_status = ?')
378
+ params.append(status.value)
379
+
380
+ if file_type:
381
+ conditions.append('file_type = ?')
382
+ params.append(file_type)
383
+
384
+ if conditions:
385
+ query += ' WHERE ' + ' AND '.join(conditions)
386
+
387
+ query += ' ORDER BY upload_timestamp DESC LIMIT ?'
388
+ params.append(limit)
389
+
390
+ cursor.execute(query, params)
391
+ rows = cursor.fetchall()
392
+
393
+ documents = []
394
+ for row in rows:
395
+ documents.append(DocumentMetadata(
396
+ document_id=row[0],
397
+ filename=row[1],
398
+ file_path=row[2],
399
+ file_type=row[3],
400
+ upload_timestamp=datetime.fromisoformat(row[4]),
401
+ processing_status=ProcessingStatus(row[5]),
402
+ total_chunks=row[6],
403
+ file_size=row[7],
404
+ checksum=row[8],
405
+ error_message=row[9],
406
+ processing_time=row[10],
407
+ metadata_json=row[11]
408
+ ))
409
+
410
+ return documents
411
+
412
+ except Exception as e:
413
+ logger.error(f"Failed to list documents: {e}")
414
+ return []
415
+
416
+ def delete_document(self, doc_id: str) -> bool:
417
+ try:
418
+ with sqlite3.connect(self.db_path) as conn:
419
+ cursor = conn.cursor()
420
+
421
+ # Delete citations first (foreign key constraint)
422
+ cursor.execute('DELETE FROM citations WHERE document_id = ?', (doc_id,))
423
+
424
+ # Delete document
425
+ cursor.execute('DELETE FROM documents WHERE document_id = ?', (doc_id,))
426
+
427
+ conn.commit()
428
+ logger.info(f"Deleted document and citations: {doc_id}")
429
+ return True
430
+
431
+ except Exception as e:
432
+ logger.error(f"Failed to delete document: {e}")
433
+ return False
434
+
435
+ def get_statistics(self) -> Dict[str, Any]:
436
+ """
437
+ Get database statistics.
438
+
439
+ Returns:
440
+ Dictionary with database statistics
441
+ """
442
+ try:
443
+ with sqlite3.connect(self.db_path) as conn:
444
+ cursor = conn.cursor()
445
+
446
+ # Count documents by status
447
+ cursor.execute('''
448
+ SELECT processing_status, COUNT(*)
449
+ FROM documents
450
+ GROUP BY processing_status
451
+ ''')
452
+ status_counts = dict(cursor.fetchall())
453
+
454
+ # Count documents by type
455
+ cursor.execute('''
456
+ SELECT file_type, COUNT(*)
457
+ FROM documents
458
+ GROUP BY file_type
459
+ ''')
460
+ type_counts = dict(cursor.fetchall())
461
+
462
+ # Total statistics
463
+ cursor.execute('SELECT COUNT(*) FROM documents')
464
+ total_documents = cursor.fetchone()[0]
465
+
466
+ cursor.execute('SELECT COUNT(*) FROM citations')
467
+ total_citations = cursor.fetchone()[0]
468
+
469
+ cursor.execute('SELECT SUM(total_chunks) FROM documents')
470
+ total_chunks = cursor.fetchone()[0] or 0
471
+
472
+ cursor.execute('SELECT SUM(file_size) FROM documents')
473
+ total_file_size = cursor.fetchone()[0] or 0
474
+
475
+ return {
476
+ 'total_documents': total_documents,
477
+ 'total_citations': total_citations,
478
+ 'total_chunks': total_chunks,
479
+ 'total_file_size': total_file_size,
480
+ 'documents_by_status': status_counts,
481
+ 'documents_by_type': type_counts,
482
+ 'database_path': self.db_path
483
+ }
484
+
485
+ except Exception as e:
486
+ logger.error(f"Failed to get statistics: {e}")
487
+ return {'error': str(e)}
488
+
489
+ def cleanup_orphaned_citations(self) -> int:
490
+ """
491
+ Clean up citations that reference non-existent documents.
492
+
493
+ Returns:
494
+ Number of orphaned citations removed
495
+ """
496
+ try:
497
+ with sqlite3.connect(self.db_path) as conn:
498
+ cursor = conn.cursor()
499
+
500
+ cursor.execute('''
501
+ DELETE FROM citations
502
+ WHERE document_id NOT IN (SELECT document_id FROM documents)
503
+ ''')
504
+
505
+ removed_count = cursor.rowcount
506
+ conn.commit()
507
+
508
+ logger.info(f"Cleaned up {removed_count} orphaned citations")
509
+ return removed_count
510
+
511
+ except Exception as e:
512
+ logger.error(f"Failed to cleanup orphaned citations: {e}")
513
+ return 0
514
+
515
+
516
+
517
+ if __name__=="__main__":
518
+ logger.info(f"metadata init ..")
src/pdf_processor.py ADDED
@@ -0,0 +1,495 @@
+import logging
+import fitz  # PyMuPDF
+from pathlib import Path
+from typing import Dict, List, Any, Optional
+import re
+from dataclasses import dataclass
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from src.document_processor import (
+    DocumentProcessor,
+    ProcessedDocument,
+    DocumentType,
+    ProcessingStatus,
+    ExtractedImage,
+    ExtractedTable,
+    DocumentProcessorFactory
+)
+
+
+try:
+    from logger.custom_logger import CustomLoggerTracker
+    custom_log = CustomLoggerTracker()
+    logger = custom_log.get_logger("pdf_processor")
+
+except ImportError:
+    # Fallback to standard logging if custom logger not available
+    logger = logging.getLogger("pdf_processor")
+
+
+@dataclass
+class PDFPageInfo:
+    """Information about a PDF page."""
+    page_number: int
+    width: float
+    height: float
+    rotation: int
+    text_length: int
+    image_count: int
+    table_count: int
+
+
+class PDFProcessor(DocumentProcessor):
+    """
+    PDF document processor using PyMuPDF.
+
+    This processor extracts text, images, tables, and metadata from PDF files,
+    maintaining proper citations with page numbers and section information.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """
+        Initialize the PDF processor.
+
+        Args:
+            config: Configuration dictionary containing PDF processing settings
+        """
+        super().__init__(config)
+        self.extract_images = config.get('image_processing', True)
+        self.extract_tables = config.get('table_extraction', True)
+        self.min_table_rows = config.get('min_table_rows', 2)
+        self.min_table_cols = config.get('min_table_cols', 2)
+        # Minimum embedded image size in bytes (the filter compares against
+        # len(image_bytes), not pixel dimensions)
+        self.image_min_size = config.get('image_min_size', 100)
+
+        logger.info(f"PDF processor initialized with image_processing={self.extract_images}, "
+                    f"table_extraction={self.extract_tables}")
+
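+    # Illustrative configuration for __init__ above (defaults shown; only these
+    # keys are read by this processor):
+    #   config = {
+    #       'image_processing': True,   # extract embedded images
+    #       'table_extraction': True,   # run text-based table detection
+    #       'min_table_rows': 2,        # smallest table worth keeping
+    #       'min_table_cols': 2,
+    #       'image_min_size': 100,      # skip images smaller than this many bytes
+    #   }
+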
+    def _get_supported_extensions(self) -> List[str]:
+        """Get supported file extensions for PDF processor."""
+        return ['.pdf']
+
+    def process_document(self, file_path: str) -> ProcessedDocument:
+        """
+        Process a PDF document and extract all content.
+
+        Args:
+            file_path: Path to the PDF file
+
+        Returns:
+            ProcessedDocument with extracted content and metadata. Failures are
+            not raised; they come back as a ProcessedDocument with FAILED status
+            and the error message attached.
+        """
+        try:
+            # Validate file first
+            self.validate_file(file_path)
+
+            # Generate document ID
+            document_id = self._generate_document_id(file_path)
+
+            logger.info(f"Processing PDF document: {file_path}")
+
+            # Open PDF document
+            pdf_document = fitz.open(file_path)
+
+            try:
+                # Extract metadata
+                metadata = self._extract_metadata(pdf_document)
+
+                # Process all pages
+                all_text = []
+                all_images = []
+                all_tables = []
+                page_info = []
+
+                for page_num in range(pdf_document.page_count):
+                    page = pdf_document[page_num]
+
+                    # Extract text from page
+                    page_text = self._extract_page_text(page, page_num + 1)
+                    if page_text.strip():
+                        all_text.append(f"[Page {page_num + 1}]\n{page_text}")
+
+                    # Extract images if enabled (default to empty lists so the
+                    # per-page counts below are always defined)
+                    page_images: List[ExtractedImage] = []
+                    if self.extract_images:
+                        page_images = self._extract_page_images(page, page_num + 1, document_id)
+                        all_images.extend(page_images)
+
+                    # Extract tables if enabled
+                    page_tables: List[ExtractedTable] = []
+                    if self.extract_tables:
+                        page_tables = self._extract_page_tables(page, page_num + 1)
+                        all_tables.extend(page_tables)
+
+                    # Collect page info
+                    page_info.append(PDFPageInfo(
+                        page_number=page_num + 1,
+                        width=page.rect.width,
+                        height=page.rect.height,
+                        rotation=page.rotation,
+                        text_length=len(page_text),
+                        image_count=len(page_images),
+                        table_count=len(page_tables)
+                    ))
+
+                # Combine all text
+                full_content = "\n\n".join(all_text)
+
+                # Update metadata with processing info
+                metadata.update({
+                    'total_pages': pdf_document.page_count,
+                    'total_images': len(all_images),
+                    'total_tables': len(all_tables),
+                    'total_text_length': len(full_content),
+                    'page_info': [
+                        {
+                            'page_number': info.page_number,
+                            'width': info.width,
+                            'height': info.height,
+                            'rotation': info.rotation,
+                            'text_length': info.text_length,
+                            'image_count': info.image_count,
+                            'table_count': info.table_count
+                        }
+                        for info in page_info
+                    ]
+                })
+
+                # Create processed document
+                processed_doc = ProcessedDocument(
+                    document_id=document_id,
+                    filename=Path(file_path).name,
+                    file_path=file_path,
+                    document_type=DocumentType.PDF,
+                    content=full_content,
+                    metadata=metadata,
+                    images=all_images,
+                    tables=all_tables,
+                    processing_status=ProcessingStatus.COMPLETED
+                )
+
+                logger.info(f"Successfully processed PDF: {pdf_document.page_count} pages, "
+                            f"{len(all_images)} images, {len(all_tables)} tables")
+
+                return processed_doc
+
+            finally:
+                pdf_document.close()
+
+        except Exception as e:
+            logger.error(f"Failed to process PDF {file_path}: {e}")
+
+            # Create failed document
+            document_id = self._generate_document_id(file_path)
+            return ProcessedDocument(
+                document_id=document_id,
+                filename=Path(file_path).name,
+                file_path=file_path,
+                document_type=DocumentType.PDF,
+                content="",
+                metadata={},
+                processing_status=ProcessingStatus.FAILED,
+                error_message=str(e)
+            )
+
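+    # The combined `content` string tags each page so downstream chunking can
+    # keep page-level citations. Illustrative layout (sample text is made up):
+    #   [Page 1]
+    #   Raw material usage summary ...
+    #
+    #   [Page 2]
+    #   Monthly totals ...
+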
+    def _extract_metadata(self, pdf_document: fitz.Document) -> Dict[str, Any]:
+        """
+        Extract metadata from PDF document.
+
+        Args:
+            pdf_document: PyMuPDF document object
+
+        Returns:
+            Dictionary containing PDF metadata
+        """
+        metadata = {}
+
+        try:
+            # Get document metadata
+            pdf_metadata = pdf_document.metadata
+
+            # Standard metadata fields (values may be None, so guard before strip)
+            standard_fields = ['title', 'author', 'subject', 'keywords', 'creator', 'producer']
+            for field in standard_fields:
+                value = (pdf_metadata.get(field) or '').strip()
+                if value:
+                    metadata[field] = value
+
+            # Creation and modification dates
+            if 'creationDate' in pdf_metadata:
+                metadata['creation_date'] = pdf_metadata['creationDate']
+            if 'modDate' in pdf_metadata:
+                metadata['modification_date'] = pdf_metadata['modDate']
+
+            # Document properties
+            metadata['page_count'] = pdf_document.page_count
+            metadata['is_encrypted'] = pdf_document.is_encrypted
+            metadata['is_pdf'] = pdf_document.is_pdf
+
+            # PDF version (guarded: not every PyMuPDF release exposes this)
+            if hasattr(pdf_document, 'pdf_version'):
+                metadata['pdf_version'] = pdf_document.pdf_version()
+
+        except Exception as e:
+            logger.warning(f"Failed to extract PDF metadata: {e}")
+            metadata['metadata_extraction_error'] = str(e)
+
+        return metadata
+
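+    # Typical result shape (illustrative values; keys depend on what the PDF
+    # actually carries):
+    #   {'title': 'Raw Material Usage', 'producer': 'Microsoft Word',
+    #    'creation_date': "D:20240101120000+09'00'",
+    #    'page_count': 12, 'is_encrypted': False, 'is_pdf': True}
+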
+    def _extract_page_text(self, page: fitz.Page, page_number: int) -> str:
+        """
+        Extract text from a PDF page.
+
+        Args:
+            page: PyMuPDF page object
+            page_number: Page number (1-based)
+
+        Returns:
+            Extracted text content
+        """
+        try:
+            # Extract plain text in reading order
+            text = page.get_text("text")
+
+            # Clean up text
+            text = self._clean_text(text)
+
+            return text
+
+        except Exception as e:
+            logger.warning(f"Failed to extract text from page {page_number}: {e}")
+            return ""
+
+    def _extract_page_images(self, page: fitz.Page, page_number: int, document_id: str) -> List[ExtractedImage]:
+        """
+        Extract images from a PDF page.
+
+        Args:
+            page: PyMuPDF page object
+            page_number: Page number (1-based)
+            document_id: Document ID for image naming
+
+        Returns:
+            List of ExtractedImage objects
+        """
+        images = []
+
+        try:
+            # Get image list from page
+            image_list = page.get_images()
+
+            for img_index, img in enumerate(image_list):
+                try:
+                    # Get image reference
+                    xref = img[0]
+
+                    # Extract image data
+                    base_image = page.parent.extract_image(xref)
+                    image_bytes = base_image["image"]
+                    image_ext = base_image["ext"]
+
+                    # Skip tiny images (byte-size filter; likely decorative artifacts)
+                    if len(image_bytes) < self.image_min_size:
+                        continue
+
+                    # Create image object
+                    image_id = f"{document_id}_page{page_number}_img{img_index}"
+                    filename = f"page{page_number}_image{img_index}.{image_ext}"
+
+                    extracted_image = ExtractedImage(
+                        image_id=image_id,
+                        filename=filename,
+                        content=image_bytes,
+                        format=image_ext.upper(),
+                        extraction_method="pymupdf",
+                        metadata={
+                            'page_number': page_number,
+                            'image_index': img_index,
+                            'xref': xref,
+                            'size_bytes': len(image_bytes)
+                        }
+                    )
+
+                    images.append(extracted_image)
+
+                except Exception as e:
+                    logger.warning(f"Failed to extract image {img_index} from page {page_number}: {e}")
+                    continue
+
+        except Exception as e:
+            logger.warning(f"Failed to extract images from page {page_number}: {e}")
+
+        return images
+
+    def _extract_page_tables(self, page: fitz.Page, page_number: int) -> List[ExtractedTable]:
+        """
+        Extract tables from a PDF page.
+
+        Args:
+            page: PyMuPDF page object
+            page_number: Page number (1-based)
+
+        Returns:
+            List of ExtractedTable objects
+        """
+        tables = []
+
+        try:
+            # Find tables via text analysis. This is a basic implementation;
+            # more sophisticated table detection could use libraries such as
+            # camelot-py or tabula-py.
+            text = page.get_text("text")
+            potential_tables = self._detect_tables_in_text(text, page_number)
+            tables.extend(potential_tables)
+
+        except Exception as e:
+            logger.warning(f"Failed to extract tables from page {page_number}: {e}")
+
+        return tables
+
+    def _detect_tables_in_text(self, text: str, page_number: int) -> List[ExtractedTable]:
+        """
+        Detect tables in text using pattern matching.
+
+        This is a basic implementation that looks for tabular patterns in text.
+        For production use, consider using specialized table extraction libraries.
+
+        Args:
+            text: Text content to analyze
+            page_number: Page number for metadata
+
+        Returns:
+            List of detected tables
+        """
+        tables = []
+
+        try:
+            lines = text.split('\n')
+            current_table_lines = []
+
+            for line in lines:
+                line = line.strip()
+                if not line:
+                    # An empty line may end a table
+                    if len(current_table_lines) >= self.min_table_rows:
+                        table = self._parse_table_lines(current_table_lines, page_number, len(tables))
+                        if table:
+                            tables.append(table)
+                    current_table_lines = []
+                    continue
+
+                # A line looks like a table row if it has multiple columns separated by whitespace
+                columns = re.split(r'\s{2,}', line)  # Split on runs of 2+ spaces
+                if len(columns) >= self.min_table_cols:
+                    current_table_lines.append(columns)
+                else:
+                    # Line doesn't look like table data
+                    if len(current_table_lines) >= self.min_table_rows:
+                        table = self._parse_table_lines(current_table_lines, page_number, len(tables))
+                        if table:
+                            tables.append(table)
+                    current_table_lines = []
+
+            # Check for a table at the end of the text
+            if len(current_table_lines) >= self.min_table_rows:
+                table = self._parse_table_lines(current_table_lines, page_number, len(tables))
+                if table:
+                    tables.append(table)
+
+        except Exception as e:
+            logger.warning(f"Failed to detect tables in text: {e}")
+
+        return tables
+
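+    # Worked example for _detect_tables_in_text (illustrative): given page text
+    #   Material    Qty   Unit
+    #   Flour       120   kg
+    #   Sugar        45   kg
+    # each line splits on runs of 2+ spaces into 3 columns (>= min_table_cols),
+    # and the 3 consecutive rows (>= min_table_rows) become one table, with the
+    # first row taken as headers by _parse_table_lines below.
+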
+    def _parse_table_lines(self, table_lines: List[List[str]], page_number: int, table_index: int) -> Optional[ExtractedTable]:
+        """
+        Parse table lines into an ExtractedTable object.
+
+        Args:
+            table_lines: List of table rows (each row is a list of columns)
+            page_number: Page number for metadata
+            table_index: Table index on the page
+
+        Returns:
+            ExtractedTable object or None if parsing fails
+        """
+        try:
+            if not table_lines:
+                return None
+
+            # Use the first row as headers (a simple assumption)
+            headers = [col.strip() for col in table_lines[0]]
+
+            # Remaining rows are data
+            rows = []
+            for row_data in table_lines[1:]:
+                # Pad or trim each row to match the header length
+                padded_row = row_data + [''] * (len(headers) - len(row_data))
+                rows.append([col.strip() for col in padded_row[:len(headers)]])
+
+            # Create table object
+            table_id = f"page{page_number}_table{table_index}"
+
+            return ExtractedTable(
+                table_id=table_id,
+                headers=headers,
+                rows=rows,
+                page_number=page_number,
+                extraction_confidence=0.7,  # Basic text-based extraction
+                metadata={
+                    'extraction_method': 'text_pattern_matching',
+                    'table_index': table_index
+                }
+            )
+
+        except Exception as e:
+            logger.warning(f"Failed to parse table lines: {e}")
+            return None
+
+    def _clean_text(self, text: str) -> str:
+        """
+        Clean and normalize extracted text.
+
+        Args:
+            text: Raw extracted text
+
+        Returns:
+            Cleaned text
+        """
+        if not text:
+            return ""
+
+        # Remove excessive whitespace
+        text = re.sub(r'\n\s*\n', '\n\n', text)  # Collapse blank-line runs to one blank line
+        text = re.sub(r'[ \t]+', ' ', text)      # Collapse runs of spaces/tabs to a single space
+
+        # Replace form feeds (page breaks) with newlines; '\f' and '\x0c' are the
+        # same character, so one replace covers both spellings in the original
+        text = text.replace('\f', '\n')
+
+        # Strip leading/trailing whitespace
+        text = text.strip()
+
+        return text
+
+
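+# Example of _clean_text normalization (illustrative):
+#   "A\n\n\nB\tC  D\f"  ->  "A\n\nB C D"
+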
+# Register the PDF processor
+DocumentProcessorFactory.register_processor(DocumentType.PDF, PDFProcessor)
+
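+# With the processor registered, callers can go through the factory rather than
+# instantiating PDFProcessor directly (sketch; the path is a placeholder):
+#   proc = DocumentProcessorFactory.create_processor("manual.pdf", config)
+#   doc = proc.process_document("manual.pdf")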
+
+
+if __name__ == "__main__":
+    logger.info("PDF processor init ..")
+
+    ## Test code (for demonstration purposes)
+    config = {'image_processing': True, 'table_extraction': True}
+    processor = DocumentProcessorFactory.create_processor("/Users/ahmedmostafa/Downloads/eval_Korean_qa/data/documents/원재료사용현황.pdf", config)
+    processed_doc = processor.process_document("/Users/ahmedmostafa/Downloads/eval_Korean_qa/data/documents/원재료사용현황.pdf")
+    chunks = processor.extract_chunks(processed_doc)
+    for chunk in chunks:
+        print(chunk)