File size: 5,731 Bytes
4d8a2c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""

Vectorless RAG on Structured PDFs

──────────────────────────────────

Public package exports for clean imports when using as a library.



Supports both text-based and scanned PDFs with OCR.



Usage:

    from src import PDFParser, ParsedDocument

    from src import StructuredChunker, BM25Retriever

    from src import VectorlessRAGPipeline

"""

# Core PDF Parser (with OCR support)
from .pdf_parser import (
    PDFParser,
    ParsedDocument,
    ParsedPage,
    TextBlock,
    Heading,
    TableData,
    DocumentMetadata,
    # OCR Presets and Utilities
    OCR_PRESETS,
    OCR_LANGUAGES,
    TextCleaner,
)

# Chunking Module
try:
    from .chunker import (
        StructuredChunker,
        Chunk,
        ChunkType,
    )
except ImportError as exc:
    # Keep the package importable without the chunker: the public names stay
    # defined (as None) so `from src import StructuredChunker` still works.
    # ImportError is also raised when chunker.py exists but one of its own
    # imports fails, so the underlying error is reported instead of always
    # claiming the module is missing.
    import warnings
    warnings.warn(f"chunker module could not be imported: {exc}. Install or create chunker.py")
    StructuredChunker = None
    Chunk = None
    ChunkType = None

# BM25 Retriever Module
try:
    from .retriever import (
        BM25Retriever,
        RetrievalResult,
    )
except ImportError as exc:
    # Keep the package importable without the retriever: the public names
    # stay defined (as None). ImportError is also raised when retriever.py
    # exists but one of its own imports fails, so the underlying error is
    # reported instead of always claiming the module is missing.
    import warnings
    warnings.warn(f"retriever module could not be imported: {exc}. Install or create retriever.py")
    BM25Retriever = None
    RetrievalResult = None

# RAG Pipeline Module
try:
    from .rag_pipeline import (
        VectorlessRAGPipeline,
        RAGResponse,
        Citation,
    )
except ImportError as exc:
    # Keep the package importable without the pipeline: the public names stay
    # defined (as None). ImportError is also raised when rag_pipeline.py
    # exists but one of its own imports fails, so the underlying error is
    # reported instead of always claiming the module is missing.
    import warnings
    warnings.warn(f"rag_pipeline module could not be imported: {exc}. Install or create rag_pipeline.py")
    VectorlessRAGPipeline = None
    RAGResponse = None
    Citation = None

# Package metadata. All three names are exported through __all__, and info()
# prints __version__ and __description__.
__version__ = "2.0.0"
__author__ = "Vectorless RAG Team"
__description__ = "PDF parsing with OCR support for scanned documents"

# Convenience function to create a configured parser
def create_parser(
    ocr_quality: str = "BALANCED",
    ocr_language: str = "eng",
    parallel_processing: bool = True,
    max_workers: int = 4,
) -> PDFParser:
    """Build a PDFParser from the given OCR settings.

    Every argument is handed to the ``PDFParser`` constructor unchanged,
    under the same keyword name.

    Args:
        ocr_quality: Preset name: "FAST", "BALANCED", "HIGH_QUALITY",
            "VERY_HIGH" or "MAXIMUM".
        ocr_language: OCR language code(s), e.g. "eng", "eng+fra", "hin+eng".
        parallel_processing: Enable parallel OCR processing.
        max_workers: Number of parallel workers.

    Returns:
        The configured PDFParser instance.
    """
    parser_options = {
        "ocr_quality": ocr_quality,
        "ocr_language": ocr_language,
        "parallel_processing": parallel_processing,
        "max_workers": max_workers,
    }
    return PDFParser(**parser_options)


# List available OCR languages
def list_ocr_languages() -> None:
    """Print every entry of ``OCR_LANGUAGES`` as a ``code - name`` table.

    Output goes to stdout; nothing is returned.
    """
    print("\n🌐 Available OCR Languages:")
    print("-" * 40)
    for code, name in OCR_LANGUAGES.items():
        # Pad the code to 10 columns so the names line up.
        print(f"  {code:10} - {name}")
    # The light-bulb emoji was stored as mojibake (UTF-8 bytes decoded with a
    # single-byte codepage); it is restored to the intended character.
    print("\n💡 Use '+' for multiple languages: eng+fra+deu")


# List OCR quality presets
def list_quality_presets() -> None:
    """Print each OCR quality preset with its description, DPI and timeout.

    Iterates ``OCR_PRESETS`` and subscripts every preset's config with the
    ``'description'``, ``'dpi'`` and ``'timeout'`` keys. Output goes to
    stdout; nothing is returned.
    """
    # The camera and light-bulb emoji were stored as mojibake (UTF-8 bytes
    # decoded with a single-byte codepage); both are restored here.
    print("\n📷 OCR Quality Presets:")
    print("-" * 50)
    for preset, config in OCR_PRESETS.items():
        # Preset name padded to 12 columns; details indented on the next line.
        print(f"  {preset:12} - {config['description']}")
        print(f"                  DPI: {config['dpi']}, Timeout: {config['timeout']}s")
    print("\n💡 Higher quality = slower processing")


# Module info
def info() -> None:
    """Print a banner with the package version, features and OCR summary.

    Reads ``__version__``, ``__description__``, ``OCR_LANGUAGES`` and
    ``OCR_PRESETS`` from this module. Output goes to stdout; nothing is
    returned.
    """
    # The bullet and wrench characters were stored as mojibake (UTF-8 bytes
    # decoded with a single-byte codepage); they are restored below.
    print("=" * 60)
    print("Vectorless RAG - PDF Processing Module")
    print("=" * 60)
    print(f"Version: {__version__}")
    print(f"Description: {__description__}")
    print("\n✨ Features:")
    print("  • Text-based PDF extraction (fast)")
    print("  • Scanned PDF OCR (automatic fallback)")
    print("  • Multi-language OCR support")
    print("  • Configurable quality presets")
    print("  • Parallel processing for speed")
    print("  • Heading and table detection")
    print("\n🔧 OCR Configuration:")
    # NOTE(review): this line is a fixed string; nothing in this function
    # checks whether a Tesseract path is actually configured.
    print("  • Tesseract path: Configured")
    print(f"  • Available languages: {len(OCR_LANGUAGES)}")
    print(f"  • Quality presets: {len(OCR_PRESETS)}")
    print("=" * 60)


# Public API: the names bound by "from src import *".
# The chunker, retriever and pipeline names are always listed; when their
# module failed to import above, the name is still defined but set to None.
__all__ = [
    # Core parser classes and data containers (from .pdf_parser)
    "PDFParser",
    "ParsedDocument",
    "ParsedPage",
    "TextBlock",
    "Heading",
    "TableData",
    "DocumentMetadata",
    
    # OCR presets, language table and text-cleaning utility (from .pdf_parser)
    "OCR_PRESETS",
    "OCR_LANGUAGES",
    "TextCleaner",
    
    # Chunker (None if .chunker could not be imported)
    "StructuredChunker",
    "Chunk",
    "ChunkType",
    
    # Retriever (None if .retriever could not be imported)
    "BM25Retriever",
    "RetrievalResult",
    
    # Pipeline (None if .rag_pipeline could not be imported)
    "VectorlessRAGPipeline",
    "RAGResponse",
    "Citation",
    
    # Convenience functions defined in this module
    "create_parser",
    "list_ocr_languages",
    "list_quality_presets",
    "info",
    
    # Package metadata
    "__version__",
    "__author__",
    "__description__",
]

# Check OCR availability on import
try:
    # Imported only to probe availability: OCR_READY is True when both
    # imports succeed and False as soon as either raises ImportError.
    import pytesseract
    import fitz
    OCR_READY = True
except ImportError as e:
    OCR_READY = False
    import warnings
    warnings.warn(f"OCR dependencies not fully installed: {e}. Scanned PDFs will not work.")

# Print status message (optional, comment out if not needed)
# The check-mark emoji was stored as mojibake (UTF-8 bytes decoded with a
# single-byte codepage); it is restored to the intended character.
if OCR_READY:
    print("✅ Vectorless RAG module loaded (OCR ready)")
else:
    print("⚠️ Vectorless RAG module loaded (OCR not available)")