Spaces:

milwright
/

historical-ocr

Running

File size: 3,542 Bytes

94e74f0

# Standard library imports
import os
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, 
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def is_likely_letterhead(file_path, features=None):
    """
    Determine if a document is likely to contain letterhead or marginalia
    
    Args:
        file_path: Path to the document image
        features: Optional dictionary of pre-extracted features like text density
        
    Returns:
        bool: True if the document likely contains letterhead, False otherwise
    """
    # Simple logic based on filename for initial version
    file_name = Path(file_path).name.lower()
    letterhead_indicators = ['letter', 'letterhead', 'correspondence', 'memo']
    
    # Check filename for indicators
    for indicator in letterhead_indicators:
        if indicator in file_name:
            logger.info(f"Letterhead detected based on filename: {file_name}")
            return True
    
    # Check features if provided
    if features:
        # High text density at the top of the document may indicate letterhead
        if 'top_density' in features and features['top_density'] > 0.5:
            logger.info(f"Letterhead detected based on top text density: {features['top_density']}")
            return True
        
        # Uneven text distribution may indicate marginalia
        if 'density_variance' in features and features['density_variance'] > 0.3:
            logger.info(f"Possible marginalia detected based on text density variance")
            return True
    
    # Default to standard document
    return False

def get_letterhead_prompt(file_path, features=None):
    """
    Generate a specialized prompt for letterhead document OCR
    
    Args:
        file_path: Path to the document image
        features: Optional dictionary of pre-extracted features
        
    Returns:
        str: Specialized prompt for letterhead document OCR
    """
    # Base prompt for all letterhead documents
    base_prompt = ("This document appears to be a letter or includes letterhead elements. "
                  "Please extract the following components separately if present:\n"
                  "1. Letterhead (header with logo, organization name, address, etc.)\n"
                  "2. Date\n"
                  "3. Recipient information (address, name, title)\n"
                  "4. Salutation (e.g., 'Dear Sir/Madam')\n"
                  "5. Main body text\n"
                  "6. Closing (e.g., 'Sincerely')\n"
                  "7. Signature\n"
                  "8. Any footnotes, marginalia, or annotations\n\n"
                  "Preserve the original formatting and structure as much as possible.")
    
    # Enhanced prompts based on features
    if features:
        # Extract additional context from features if available
        if 'is_historical' in features and features['is_historical']:
            base_prompt += ("\n\nThis appears to be a historical document. Pay special attention to older "
                           "letterhead styles, formal language patterns, and period-specific formatting.")
        
        if 'has_marginalia' in features and features['has_marginalia']:
            base_prompt += ("\n\nThe document contains marginalia or handwritten notes in the margins. "
                           "Please extract these separately from the main text and indicate their position.")
            
    return base_prompt