File size: 5,731 Bytes
4d8a2c2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | """
Vectorless RAG on Structured PDFs
═════════════════════════════════
Public package exports for clean imports when using as a library.
Supports both text-based and scanned PDFs with OCR.
Usage:
from src import PDFParser, ParsedDocument
from src import StructuredChunker, BM25Retriever
from src import VectorlessRAGPipeline
"""
# Core PDF Parser (with OCR support)
from .pdf_parser import (
PDFParser,
ParsedDocument,
ParsedPage,
TextBlock,
Heading,
TableData,
DocumentMetadata,
# OCR Presets and Utilities
OCR_PRESETS,
OCR_LANGUAGES,
TextCleaner,
)
# Chunking Module (optional)
try:
    from .chunker import (
        StructuredChunker,
        Chunk,
        ChunkType,
    )
except ImportError as exc:
    # ImportError is raised both when chunker.py is absent and when chunker.py
    # exists but one of its own imports fails, so the underlying cause is
    # included in the warning instead of always claiming "not found".
    import warnings

    warnings.warn(
        f"chunker module not found. Install or create chunker.py (cause: {exc})"
    )
    # Keep the public names bound so `from src import StructuredChunker`
    # still succeeds; callers must check for None before use.
    StructuredChunker = None
    Chunk = None
    ChunkType = None
# BM25 Retriever Module (optional)
try:
    from .retriever import (
        BM25Retriever,
        RetrievalResult,
    )
except ImportError as exc:
    # ImportError is raised both when retriever.py is absent and when it
    # exists but one of its own imports fails, so the underlying cause is
    # included in the warning instead of always claiming "not found".
    import warnings

    warnings.warn(
        f"retriever module not found. Install or create retriever.py (cause: {exc})"
    )
    # Keep the public names bound so importing them from the package still
    # succeeds; callers must check for None before use.
    BM25Retriever = None
    RetrievalResult = None
# RAG Pipeline Module (optional)
try:
    from .rag_pipeline import (
        VectorlessRAGPipeline,
        RAGResponse,
        Citation,
    )
except ImportError as exc:
    # ImportError is raised both when rag_pipeline.py is absent and when it
    # exists but one of its own imports fails, so the underlying cause is
    # included in the warning instead of always claiming "not found".
    import warnings

    warnings.warn(
        "rag_pipeline module not found. Install or create rag_pipeline.py "
        f"(cause: {exc})"
    )
    # Keep the public names bound so importing them from the package still
    # succeeds; callers must check for None before use.
    VectorlessRAGPipeline = None
    RAGResponse = None
    Citation = None
# Package metadata. __version__ and __description__ are printed by info();
# all three names are listed in __all__.
__version__ = "2.0.0"
__author__ = "Vectorless RAG Team"
__description__ = "PDF parsing with OCR support for scanned documents"
# Convenience function to create a configured parser
def create_parser(ocr_quality: str = "BALANCED",
                  ocr_language: str = "eng",
                  parallel_processing: bool = True,
                  max_workers: int = 4) -> PDFParser:
    """
    Build a PDFParser from the given OCR settings.

    Every argument is forwarded unchanged, by keyword, to the PDFParser
    constructor.

    Args:
        ocr_quality: Preset name: "FAST", "BALANCED", "HIGH_QUALITY",
            "VERY_HIGH" or "MAXIMUM".
        ocr_language: OCR language code; join several with "+"
            (e.g., "eng", "eng+fra", "hin+eng").
        parallel_processing: Enable parallel OCR processing.
        max_workers: Number of parallel workers.

    Returns:
        The configured PDFParser instance.
    """
    parser_options = {
        "ocr_quality": ocr_quality,
        "ocr_language": ocr_language,
        "parallel_processing": parallel_processing,
        "max_workers": max_workers,
    }
    return PDFParser(**parser_options)
# List available OCR languages
def list_ocr_languages() -> None:
    """
    Print available OCR languages.

    Writes one line per entry of OCR_LANGUAGES (code, then name) to stdout,
    followed by a hint on combining several languages with '+'.
    """
    # NOTE(review): the header emoji was unrecoverable from the mis-decoded
    # source text; a globe is assumed here - confirm against the original file.
    print("\n🌍 Available OCR Languages:")
    print("-" * 40)
    for code, name in OCR_LANGUAGES.items():
        # Left-align the code in a 10-character column so names line up.
        print(f" {code:10} - {name}")
    print("\n💡 Use '+' for multiple languages: eng+fra+deu")
# List OCR quality presets
def list_quality_presets() -> None:
    """
    Print available OCR quality presets.

    For each entry of OCR_PRESETS, writes the preset name with its
    'description', then its 'dpi' and 'timeout' values, to stdout.
    """
    print("\n📷 OCR Quality Presets:")
    print("-" * 50)
    for preset, config in OCR_PRESETS.items():
        # Left-align the preset name in a 12-character column.
        print(f" {preset:12} - {config['description']}")
        print(f" DPI: {config['dpi']}, Timeout: {config['timeout']}s")
    print("\n💡 Higher quality = slower processing")
# Module info
def info() -> None:
    """
    Print module information and capabilities.

    Writes a banner to stdout containing the package version and description,
    a fixed feature list, and the number of entries in OCR_LANGUAGES and
    OCR_PRESETS.
    """
    print("=" * 60)
    print("Vectorless RAG - PDF Processing Module")
    print("=" * 60)
    print(f"Version: {__version__}")
    print(f"Description: {__description__}")
    print("\n✨ Features:")
    print(" • Text-based PDF extraction (fast)")
    print(" • Scanned PDF OCR (automatic fallback)")
    print(" • Multi-language OCR support")
    print(" • Configurable quality presets")
    print(" • Parallel processing for speed")
    print(" • Heading and table detection")
    print("\n🔧 OCR Configuration:")
    # Static text: no Tesseract lookup is performed here, so "Configured"
    # does not reflect an actual check of the installation.
    print(" • Tesseract path: Configured")
    print(f" • Available languages: {len(OCR_LANGUAGES)}")
    print(f" • Quality presets: {len(OCR_PRESETS)}")
    print("=" * 60)
# Public API: the names exported by "from src import *".
# The chunker, retriever and pipeline names are always listed; each is bound
# to None when its optional module failed to import.
__all__ = [
    # Core parser and its data classes
    "PDFParser",
    "ParsedDocument",
    "ParsedPage",
    "TextBlock",
    "Heading",
    "TableData",
    "DocumentMetadata",
    # OCR presets and utilities
    "OCR_PRESETS",
    "OCR_LANGUAGES",
    "TextCleaner",
    # Chunker (None if the chunker module could not be imported)
    "StructuredChunker",
    "Chunk",
    "ChunkType",
    # Retriever (None if the retriever module could not be imported)
    "BM25Retriever",
    "RetrievalResult",
    # Pipeline (None if the rag_pipeline module could not be imported)
    "VectorlessRAGPipeline",
    "RAGResponse",
    "Citation",
    # Convenience functions defined in this module
    "create_parser",
    "list_ocr_languages",
    "list_quality_presets",
    "info",
    # Package metadata
    "__version__",
    "__author__",
    "__description__",
]
# Check OCR availability on import.
# pytesseract and fitz are imported only to probe that they are installed;
# the names are not used further in this module.
try:
    import pytesseract
    import fitz
    OCR_READY = True
except ImportError as e:
    OCR_READY = False
    import warnings
    warnings.warn(f"OCR dependencies not fully installed: {e}. Scanned PDFs will not work.")

# Print status message (optional, comment out if not needed).
# Each message is a single-line string literal; a literal split across two
# physical lines is a syntax error.
if OCR_READY:
    print("✅ Vectorless RAG module loaded (OCR ready)")
else:
    print("⚠️ Vectorless RAG module loaded (OCR not available)")