| """
|
| Vectorless RAG on Structured PDFs
|
| ──────────────────────────────────
|
| Public package exports for clean imports when the package is used as a library.
|
|
|
| Supports both text-based and scanned PDFs with OCR.
|
|
|
| Usage:
|
| from src import PDFParser, ParsedDocument
|
| from src import StructuredChunker, BM25Retriever
|
| from src import VectorlessRAGPipeline
|
| """
|
|
|
|
|
| from .pdf_parser import (
|
| PDFParser,
|
| ParsedDocument,
|
| ParsedPage,
|
| TextBlock,
|
| Heading,
|
| TableData,
|
| DocumentMetadata,
|
|
|
| OCR_PRESETS,
|
| OCR_LANGUAGES,
|
| TextCleaner,
|
| )
|
|
|
|
|
# Optional component: the package must stay importable when the chunker
# cannot be loaded, so its public names fall back to None placeholders.
try:
    from .chunker import (
        StructuredChunker,
        Chunk,
        ChunkType,
    )
except ImportError as exc:
    import warnings

    # Report the underlying error as well. An ImportError raised by a missing
    # dependency inside chunker.py would otherwise be indistinguishable from
    # chunker.py itself being absent.
    warnings.warn(
        "chunker module not found. Install or create chunker.py "
        f"(cause: {exc})"
    )
    # None placeholders keep the names importable from the package.
    StructuredChunker = None
    Chunk = None
    ChunkType = None
|
|
|
|
|
# Optional component: the package must stay importable when the retriever
# cannot be loaded, so its public names fall back to None placeholders.
try:
    from .retriever import (
        BM25Retriever,
        RetrievalResult,
    )
except ImportError as exc:
    import warnings

    # Report the underlying error as well. An ImportError raised by a missing
    # dependency inside retriever.py would otherwise be indistinguishable from
    # retriever.py itself being absent.
    warnings.warn(
        "retriever module not found. Install or create retriever.py "
        f"(cause: {exc})"
    )
    # None placeholders keep the names importable from the package.
    BM25Retriever = None
    RetrievalResult = None
|
|
|
|
|
# Optional component: the package must stay importable when the RAG pipeline
# cannot be loaded, so its public names fall back to None placeholders.
try:
    from .rag_pipeline import (
        VectorlessRAGPipeline,
        RAGResponse,
        Citation,
    )
except ImportError as exc:
    import warnings

    # Report the underlying error as well. An ImportError raised by a missing
    # dependency inside rag_pipeline.py would otherwise be indistinguishable
    # from rag_pipeline.py itself being absent.
    warnings.warn(
        "rag_pipeline module not found. Install or create rag_pipeline.py "
        f"(cause: {exc})"
    )
    # None placeholders keep the names importable from the package.
    VectorlessRAGPipeline = None
    RAGResponse = None
    Citation = None
|
|
|
|
|
# Package metadata. info() prints __version__ and __description__, and all
# three names are re-exported through __all__.
__version__ = "2.0.0"
__author__ = "Vectorless RAG Team"
__description__ = "PDF parsing with OCR support for scanned documents"
|
|
|
|
|
def create_parser(ocr_quality: str = "BALANCED",
                  ocr_language: str = "eng",
                  parallel_processing: bool = True,
                  max_workers: int = 4) -> PDFParser:
    """
    Build a PDFParser from the given OCR settings.

    Every argument is forwarded unchanged to ``PDFParser`` as a keyword
    argument of the same name; no validation happens here.

    Args:
        ocr_quality: "FAST", "BALANCED", "HIGH_QUALITY", "VERY_HIGH", "MAXIMUM"
        ocr_language: OCR language (e.g., "eng", "eng+fra", "hin+eng")
        parallel_processing: Enable parallel OCR processing
        max_workers: Number of parallel workers

    Returns:
        Configured PDFParser instance
    """
    parser_options = {
        "ocr_quality": ocr_quality,
        "ocr_language": ocr_language,
        "parallel_processing": parallel_processing,
        "max_workers": max_workers,
    }
    return PDFParser(**parser_options)
|
|
|
|
|
|
|
def list_ocr_languages():
    """
    Print every entry of ``OCR_LANGUAGES`` as a ``code - name`` table.

    Output goes to stdout; nothing is returned.
    """
    divider = "-" * 40

    print("\n🌐 Available OCR Languages:")
    print(divider)
    for lang_code, lang_name in OCR_LANGUAGES.items():
        # Pad the code to 10 characters so the names line up in a column.
        print(f" {lang_code:10} - {lang_name}")
    print("\n💡 Use '+' for multiple languages: eng+fra+deu")
|
|
|
|
|
|
|
def list_quality_presets():
    """
    Print every entry of ``OCR_PRESETS`` with its description and settings.

    Each preset is read through its ``'description'``, ``'dpi'`` and
    ``'timeout'`` keys. Output goes to stdout; nothing is returned.
    """
    divider = "-" * 50

    print("\n📷 OCR Quality Presets:")
    print(divider)
    for preset_name, settings in OCR_PRESETS.items():
        # Two lines per preset: the summary, then its DPI and timeout.
        print(f" {preset_name:12} - {settings['description']}")
        print(f" DPI: {settings['dpi']}, Timeout: {settings['timeout']}s")
    print("\n💡 Higher quality = slower processing")
|
|
|
|
|
|
|
def info():
    """
    Print a summary banner: version, description, features and OCR counts.

    Output goes to stdout; nothing is returned.
    """
    banner = "=" * 60
    features = (
        "Text-based PDF extraction (fast)",
        "Scanned PDF OCR (automatic fallback)",
        "Multi-language OCR support",
        "Configurable quality presets",
        "Parallel processing for speed",
        "Heading and table detection",
    )

    print(banner)
    print("Vectorless RAG - PDF Processing Module")
    print(banner)
    print(f"Version: {__version__}")
    print(f"Description: {__description__}")

    print("\n✨ Features:")
    for feature in features:
        print(f" • {feature}")

    print("\n🔧 OCR Configuration:")
    # Fixed text: this line does not inspect the Tesseract installation.
    print(" • Tesseract path: Configured")
    print(f" • Available languages: {len(OCR_LANGUAGES)}")
    print(f" • Quality presets: {len(OCR_PRESETS)}")
    print(banner)
|
|
|
|
|
|
|
# Public API of the package. Names from the optional modules (chunker,
# retriever, rag_pipeline) are listed even when those imports fell back to
# None, so `from <package> import *` always succeeds.
__all__ = [
    # Parser and parsed-document types (from .pdf_parser)
    "PDFParser", "ParsedDocument", "ParsedPage", "TextBlock",
    "Heading", "TableData", "DocumentMetadata",
    # OCR configuration tables and text cleaning (from .pdf_parser)
    "OCR_PRESETS", "OCR_LANGUAGES", "TextCleaner",
    # Chunking (from .chunker)
    "StructuredChunker", "Chunk", "ChunkType",
    # Retrieval (from .retriever)
    "BM25Retriever", "RetrievalResult",
    # End-to-end pipeline (from .rag_pipeline)
    "VectorlessRAGPipeline", "RAGResponse", "Citation",
    # Convenience helpers defined in this module
    "create_parser", "list_ocr_languages", "list_quality_presets", "info",
    # Package metadata
    "__version__", "__author__", "__description__",
]
|
|
|
|
|
# Probe the OCR dependencies once at import time. pytesseract and fitz are
# imported here only to learn whether they are available; OCR_READY records
# the outcome.
try:
    import pytesseract
    import fitz
except ImportError as exc:
    OCR_READY = False
    import warnings

    warnings.warn(f"OCR dependencies not fully installed: {exc}. Scanned PDFs will not work.")
else:
    OCR_READY = True

# Announce the load status on stdout.
print(
    "✅ Vectorless RAG module loaded (OCR ready)"
    if OCR_READY
    else "⚠️ Vectorless RAG module loaded (OCR not available)"
)