File size: 7,442 Bytes
3dd2ff2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""

import re
import logging
from pathlib import Path
from typing import Union, Dict, Any, Optional, List

# Module-level logging setup: request INFO-level, timestamped records from
# the root logger, then create the logger used by the functions below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Decide whether an image looks like a letterhead document with marginalia.

    The lowercased path is first searched for a fixed set of filename
    patterns. If none of them match and ``features`` is supplied, its
    ``uppercase_sections`` entry is consulted instead.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        bool: True if likely a letterhead document
    """
    lowered_path = str(image_path).lower()

    # Filename conventions that are treated as letterhead scans
    filename_regexes = (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )
    if any(re.search(regex, lowered_path) for regex in filename_regexes):
        logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
        return True

    # Fall back to preprocessing features: more than one ALL CAPS section
    # is taken as a hint that the page carries marginalia
    if features and features.get('uppercase_sections', 0) > 1:
        logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
        return True

    return False

def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Build an OCR prompt tailored to letterhead documents.

    Images that is_likely_letterhead() rejects get no prompt at all. Paths
    containing "baldwin" (case-insensitive) receive a dedicated hotel
    letterhead prompt; every other letterhead gets the general one.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        str: Custom prompt for letterhead OCR or None if not applicable
    """
    # Non-letterhead images are not given a specialized prompt
    if not is_likely_letterhead(image_path, features):
        return None

    # Default instructions used for any letterhead document
    general_prompt = """
    This appears to be a letterhead document. Please extract the text with the following guidelines:
    
    1. Identify the header/letterhead section with company name, logo, address, etc.
    2. Identify any margin text or notes that appear separate from the main content
    3. Extract the main letter/document body separately
    4. Format the output as follows:
       - LETTERHEAD: [letterhead text]
       - MARGIN_NOTES: [any text in margins]
       - BODY: [main document body]
       
    Be careful not to duplicate content between sections.
    """

    # Anything that is not a known baldwin document uses the general prompt
    if "baldwin" not in str(image_path).lower():
        return general_prompt

    # Most specialized prompt, reserved for the baldwin documents
    return """
        This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
        
        1. Identify and separate the letterhead elements:
           - Header: The hotel name, address, and contact information at the top
           - Marginalia: The amenities description in ALL CAPS along the margins
           
        2. Extract the main handwritten letter content separately
        
        3. Note any image captions separately
        
        4. Format the output as follows:
           - HEADER: [header text]
           - MARGINS: [marginalia text]
           - LETTER: [handwritten letter text]
           - CAPTIONS: [any image captions]
           
        Be careful not to duplicate content between sections, especially with margin text.
        """

# Maps every marker the specialized prompts may produce to the canonical
# section it belongs to. Aliases (HEADER:/LETTERHEAD:, MARGINS:/MARGIN_NOTES:,
# LETTER:/BODY:) share one section so neither variant is dropped.
# Order matters: it is the order in which markers are searched for in a line.
_SECTION_MARKERS = {
    "HEADER:": "LETTERHEAD",
    "LETTERHEAD:": "LETTERHEAD",
    "MARGINS:": "MARGIN_NOTES",
    "MARGIN_NOTES:": "MARGIN_NOTES",
    "LETTER:": "BODY",
    "BODY:": "BODY",
    "CAPTIONS:": "CAPTIONS",
}

# Output order and display title of each canonical section. "UNKNOWN" holds
# any text that appeared before the first marker.
_SECTION_TITLES = (
    ("LETTERHEAD", "--- LETTERHEAD ---"),
    ("MARGIN_NOTES", "--- MARGIN NOTES ---"),
    ("BODY", "--- DOCUMENT BODY ---"),
    ("CAPTIONS", "--- IMAGE CAPTIONS ---"),
    ("UNKNOWN", "--- ADDITIONAL CONTENT ---"),
)


def _store_section_chunk(sections, name, lines):
    """Add the joined, stripped lines to section ``name`` unless empty or already stored."""
    chunk = '\n'.join(lines).strip()
    if not chunk:
        return
    stored = sections.setdefault(name, [])
    # An exact repeat of an earlier chunk is OCR duplication, not new content
    if chunk not in stored:
        stored.append(chunk)


def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by handling section markers
    and reducing duplication.

    Lines containing one of the known markers (HEADER:, LETTERHEAD:,
    MARGINS:, MARGIN_NOTES:, LETTER:, BODY:, CAPTIONS:) start a new section;
    text after the marker on that line is kept, text before it is discarded.
    Content is accumulated per section, so a marker that occurs more than
    once, or two alias markers for the same section, no longer overwrite
    each other. Sections that end up empty are omitted, and a chunk that
    exactly repeats an earlier chunk of the same section is stored once.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Text reformatted into titled sections in the fixed order
        letterhead, margin notes, document body, image captions, additional
        content. The input is returned unchanged when it contains no
        markers, and "" is returned for empty input.
    """
    if not text:
        return ""

    # Without any marker there is nothing to restructure
    if not any(marker in text for marker in _SECTION_MARKERS):
        return text

    sections: Dict[str, List[str]] = {}
    current_section = "UNKNOWN"
    current_lines: List[str] = []

    for line in text.split('\n'):
        # First marker (in table order) found anywhere in the line, if any
        marker = next((m for m in _SECTION_MARKERS if m in line), None)
        if marker is None:
            current_lines.append(line)
            continue

        # Close the section collected so far before switching
        _store_section_chunk(sections, current_section, current_lines)
        current_section = _SECTION_MARKERS[marker]

        # Keep any text that follows the marker on the same line
        remainder = line.split(marker, 1)[1].strip()
        current_lines = [remainder] if remainder else []

    # Close the final section
    _store_section_chunk(sections, current_section, current_lines)

    # Emit the sections that have content, in the standard order
    return "\n\n".join(
        f"{title}\n" + "\n".join(sections[name])
        for name, title in _SECTION_TITLES
        if name in sections
    )