"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""

import re
import logging
from pathlib import Path
from typing import Union, Dict, Any, Optional, List

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Filename patterns that mark a path as a probable letterhead scan. They are
# searched (not full-matched) against the lower-cased path string, and are
# compiled once here instead of on every call.
# NOTE(review): only ``.jpg`` is recognised; confirm whether other image
# extensions should be accepted before widening these patterns.
_LETTERHEAD_PATH_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )
)

# Bucket for text that appears before the first section marker.
_UNKNOWN_SECTION = "UNKNOWN"

# The two prompts use different marker names for the same logical section;
# every marker is folded into one canonical section so no alias is dropped.
_SECTION_ALIASES = {
    "HEADER": "LETTERHEAD",
    "LETTERHEAD": "LETTERHEAD",
    "MARGINS": "MARGIN_NOTES",
    "MARGIN_NOTES": "MARGIN_NOTES",
    "LETTER": "BODY",
    "BODY": "BODY",
    "CAPTIONS": "CAPTIONS",
}

# Canonical sections in output order, paired with the title printed for each.
_SECTION_TITLES = (
    ("LETTERHEAD", "LETTERHEAD"),
    ("MARGIN_NOTES", "MARGIN NOTES"),
    ("BODY", "DOCUMENT BODY"),
    ("CAPTIONS", "IMAGE CAPTIONS"),
    (_UNKNOWN_SECTION, "ADDITIONAL CONTENT"),
)

# Matches a section marker such as "HEADER:". The lookbehind rejects markers
# embedded in a longer word (e.g. "NOBODY:" must not count as "BODY:"), while
# still allowing bullet or markdown decoration such as "- " or "**" before it.
_SECTION_MARKER_RE = re.compile(
    r'(?<![A-Za-z0-9])'
    r'(LETTERHEAD|HEADER|MARGIN_NOTES|MARGINS|LETTER|BODY|CAPTIONS):'
)


def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Detect if an image is likely a letterhead document with marginalia.
    Uses path/filename patterns and optional image features (if provided).

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing. Only the
            'uppercase_sections' entry is read here.

    Returns:
        bool: True if likely a letterhead document
    """
    # Lower-case so that the patterns match regardless of filename casing
    path_str = str(image_path).lower()

    # Check for common letterhead filename patterns
    if any(pattern.search(path_str) for pattern in _LETTERHEAD_PATH_PATTERNS):
        logger.info("Detected likely letterhead document: %s", Path(image_path).name)
        return True

    # If features are provided, use them for additional detection
    if features:
        # More than one ALL CAPS section is treated as a sign of marginalia.
        # A key that is present but set to None counts as "no information"
        # rather than raising TypeError on the comparison.
        uppercase_sections = features.get('uppercase_sections')
        if uppercase_sections is not None and uppercase_sections > 1:
            logger.info(
                "Detected likely letterhead document with marginalia by features: %s",
                Path(image_path).name,
            )
            return True

    return False


def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Generate a specialized prompt for letterhead documents to improve OCR quality.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        str: Custom prompt for letterhead OCR or None if not applicable
    """
    if not is_likely_letterhead(image_path, features):
        return None

    # Path-specific customizations for known problematic documents
    path_str = str(image_path).lower()

    # Most specialized prompt for baldwin documents
    if "baldwin" in path_str:
        return """
        This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:

        1. Identify and separate the letterhead elements:
           - Header: The hotel name, address, and contact information at the top
           - Marginalia: The amenities description in ALL CAPS along the margins

        2. Extract the main handwritten letter content separately

        3. Note any image captions separately

        4. Format the output as follows:
           - HEADER: [header text]
           - MARGINS: [marginalia text]
           - LETTER: [handwritten letter text]
           - CAPTIONS: [any image captions]

        Be careful not to duplicate content between sections, especially with margin text.
        """

    # General letterhead prompt
    return """
    This appears to be a letterhead document. Please extract the text with the following guidelines:

    1. Identify the header/letterhead section with company name, logo, address, etc.

    2. Identify any margin text or notes that appear separate from the main content

    3. Extract the main letter/document body separately

    4. Format the output as follows:
       - LETTERHEAD: [letterhead text]
       - MARGIN_NOTES: [any text in margins]
       - BODY: [main document body]

    Be careful not to duplicate content between sections.
    """


def _store_section_chunk(chunks: Dict[str, List[str]], section: str, lines: List[str]) -> None:
    """Append the stripped text of *lines* to *section*, skipping empty or repeated chunks."""
    chunk = '\n'.join(lines).strip()
    if not chunk:
        return
    bucket = chunks.setdefault(section, [])
    # An identical chunk filed twice under one section is pure duplication.
    if chunk not in bucket:
        bucket.append(chunk)


def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by handling section markers
    and reducing duplication.

    Markers (HEADER:, LETTERHEAD:, MARGINS:, MARGIN_NOTES:, LETTER:, BODY:,
    CAPTIONS:) are matched case-sensitively. Text that precedes a marker on
    the same line is discarded; text that follows it starts the new section.
    Alias markers and repeated markers are merged in order of appearance,
    and sections that end up empty are omitted.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Cleaned text with proper section formatting. The input is
            returned unchanged when it contains no section marker, and an
            empty string is returned for empty input.
    """
    if not text:
        return ""

    # If no section markers were found, return the original text
    if not _SECTION_MARKER_RE.search(text):
        return text

    chunks: Dict[str, List[str]] = {}
    current_section = _UNKNOWN_SECTION
    current_lines: List[str] = []

    for line in text.split('\n'):
        # search() returns the leftmost marker, so a marker name quoted later
        # on the same line cannot hijack the section.
        match = _SECTION_MARKER_RE.search(line)
        if match is None:
            current_lines.append(line)
            continue

        # Close the section that was being collected and open the new one
        _store_section_chunk(chunks, current_section, current_lines)
        current_section = _SECTION_ALIASES[match.group(1)]

        # Keep any text after the marker on this line
        remainder = line[match.end():].strip()
        current_lines = [remainder] if remainder else []

    # Save the last section
    _store_section_chunk(chunks, current_section, current_lines)

    # Format with standard order and clear section headers
    formatted_sections = [
        f"--- {title} ---\n" + '\n'.join(chunks[name])
        for name, title in _SECTION_TITLES
        if name in chunks
    ]

    # Join everything with clear separation
    return "\n\n".join(formatted_sections)