# Space: milwright/historical-ocr (status: Running)
# File size: 7,442 bytes, revision 3dd2ff2

"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""

import re
import logging
from pathlib import Path
from typing import Union, Dict, Any, Optional, List

# Set up root logging (INFO level, timestamped records) and create the
# logger used by the functions in this module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Decide whether an image is probably a letterhead document with marginalia.

    The lower-cased path is searched against a fixed list of filename
    patterns (each of which looks for a ``.jpg`` name). If none match and
    ``features`` is supplied, an ``uppercase_sections`` value greater than 1
    is taken as evidence of marginalia.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing. Only
            the ``uppercase_sections`` key is read; a missing or ``None``
            value counts as "no evidence".

    Returns:
        bool: True if likely a letterhead document
    """
    # Pattern matching is done on the lower-cased string form of the path
    path_str = str(image_path).lower()

    # Common letterhead filename patterns
    letterhead_patterns = (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )

    if any(re.search(pattern, path_str) for pattern in letterhead_patterns):
        logger.info("Detected likely letterhead document: %s", Path(image_path).name)
        return True

    # Fall back to preprocessing features when the filename gives no hint
    if features:
        # More than one ALL CAPS section is treated as a sign of marginalia.
        # Guard against an explicit None so the comparison cannot raise
        # TypeError when preprocessing left the value unset.
        uppercase_sections = features.get('uppercase_sections')
        if uppercase_sections is not None and uppercase_sections > 1:
            logger.info(
                "Detected likely letterhead document with marginalia by features: %s",
                Path(image_path).name,
            )
            return True

    return False

def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Build the OCR prompt to use for a letterhead document.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        Optional[str]: None when the image is not judged to be a letterhead;
        otherwise the "baldwin" prompt if the lower-cased path contains
        "baldwin", or the general letterhead prompt.
    """
    # Nothing to customise for documents that do not look like letterheads
    if not is_likely_letterhead(image_path, features):
        return None

    # Known problematic documents are recognised by a substring of the path
    baldwin_specific = "baldwin" in str(image_path).lower()

    if baldwin_specific:
        # Most specialized prompt: hotel letterhead with handwritten letter
        prompt = """
        This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
        
        1. Identify and separate the letterhead elements:
           - Header: The hotel name, address, and contact information at the top
           - Marginalia: The amenities description in ALL CAPS along the margins
           
        2. Extract the main handwritten letter content separately
        
        3. Note any image captions separately
        
        4. Format the output as follows:
           - HEADER: [header text]
           - MARGINS: [marginalia text]
           - LETTER: [handwritten letter text]
           - CAPTIONS: [any image captions]
           
        Be careful not to duplicate content between sections, especially with margin text.
        """
    else:
        # Generic prompt for every other letterhead
        prompt = """
    This appears to be a letterhead document. Please extract the text with the following guidelines:
    
    1. Identify the header/letterhead section with company name, logo, address, etc.
    2. Identify any margin text or notes that appear separate from the main content
    3. Extract the main letter/document body separately
    4. Format the output as follows:
       - LETTERHEAD: [letterhead text]
       - MARGIN_NOTES: [any text in margins]
       - BODY: [main document body]
       
    Be careful not to duplicate content between sections.
    """

    return prompt

def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by regrouping the text under
    standard section banners.

    Lines are assigned to the section named by the most recent marker
    (``HEADER:``/``LETTERHEAD:``, ``MARGINS:``/``MARGIN_NOTES:``,
    ``LETTER:``/``BODY:``, ``CAPTIONS:``). Alias markers share one section,
    and a section that appears more than once is merged in order of
    appearance rather than overwritten. Text found before the first marker
    is reported last as additional content. Sections that contain only
    whitespace are omitted.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Text regrouped as LETTERHEAD, MARGIN NOTES, DOCUMENT BODY,
        IMAGE CAPTIONS and ADDITIONAL CONTENT blocks separated by blank
        lines; the input unchanged if it contains no section marker; an
        empty string for empty input.
    """
    if not text:
        return ""

    # Each marker the specialized prompts may produce, mapped to the
    # canonical section it belongs to (aliases collapse to one section).
    marker_to_section = {
        "HEADER:": "LETTERHEAD",
        "LETTERHEAD:": "LETTERHEAD",
        "MARGINS:": "MARGIN_NOTES",
        "MARGIN_NOTES:": "MARGIN_NOTES",
        "LETTER:": "BODY",
        "BODY:": "BODY",
        "CAPTIONS:": "CAPTIONS",
    }

    # Without any marker there is nothing to restructure
    if not any(marker in text for marker in marker_to_section):
        return text

    # Section name -> list of lines collected for it (repeats accumulate)
    collected = {}
    current_section = "UNKNOWN"

    for line in text.split('\n'):
        hits = [(line.find(marker), marker) for marker in marker_to_section if marker in line]
        if not hits:
            collected.setdefault(current_section, []).append(line)
            continue

        # Use the marker that starts earliest in the line, so that
        # "LETTERHEAD:" is not mistaken for the "HEADER:" it contains.
        position, marker = min(hits)

        # Words in front of the marker belong to the section still open;
        # pure bullet/markdown decoration ("- ", "**", "1.") is dropped.
        leading = line[:position].rstrip()
        if any(ch.isalpha() for ch in leading):
            collected.setdefault(current_section, []).append(leading)

        current_section = marker_to_section[marker]

        # Keep any text after the marker on this line
        remainder = line[position + len(marker):].strip()
        if remainder:
            collected.setdefault(current_section, []).append(remainder)

    cleaned = {name: '\n'.join(lines).strip() for name, lines in collected.items()}

    # Standard output order with clear section headers
    layout = (
        ("LETTERHEAD", "--- LETTERHEAD ---"),
        ("MARGIN_NOTES", "--- MARGIN NOTES ---"),
        ("BODY", "--- DOCUMENT BODY ---"),
        ("CAPTIONS", "--- IMAGE CAPTIONS ---"),
        ("UNKNOWN", "--- ADDITIONAL CONTENT ---"),
    )
    formatted_sections = [
        f"{banner}\n{cleaned[name]}"
        for name, banner in layout
        if cleaned.get(name)
    ]

    # Join everything with clear separation
    return "\n\n".join(formatted_sections)