Spaces:
Running
Running
File size: 7,442 Bytes
3dd2ff2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""
import re
import logging
from pathlib import Path
from typing import Union, Dict, Any, Optional, List
# Install the root logging configuration at import time, then create the
# module-scoped logger used by the functions below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Heuristically decide whether an image is a letterhead document with marginalia.

    The lower-cased path is searched with a fixed list of filename regexes.
    When none of them match and ``features`` is supplied, the
    ``uppercase_sections`` entry of ``features`` is consulted instead.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        bool: True if likely a letterhead document
    """
    # Filename regexes that mark a document as a letterhead.
    filename_regexes = (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )

    # Match case-insensitively by lower-casing the path once up front.
    lowered_path = str(image_path).lower()
    if any(re.search(regex, lowered_path) for regex in filename_regexes):
        logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
        return True

    # Fall back to preprocessing features: more than one ALL CAPS section
    # is treated as a sign of marginalia.
    if features and features.get('uppercase_sections', 0) > 1:
        logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
        return True

    return False
def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Build the OCR prompt to use for a letterhead document.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing

    Returns:
        Optional[str]: A hotel-letterhead prompt when the lower-cased path
        contains "baldwin", the general letterhead prompt for any other
        detected letterhead, or None when the image is not detected as a
        letterhead.
    """
    # NOTE: the prompt text is kept at column 0 so that no source
    # indentation ends up inside the string sent to the OCR model.

    # Most specialized prompt, used for baldwin documents.
    baldwin_prompt = """
This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
1. Identify and separate the letterhead elements:
- Header: The hotel name, address, and contact information at the top
- Marginalia: The amenities description in ALL CAPS along the margins
2. Extract the main handwritten letter content separately
3. Note any image captions separately
4. Format the output as follows:
- HEADER: [header text]
- MARGINS: [marginalia text]
- LETTER: [handwritten letter text]
- CAPTIONS: [any image captions]
Be careful not to duplicate content between sections, especially with margin text.
"""

    # Fallback prompt for every other letterhead.
    general_prompt = """
This appears to be a letterhead document. Please extract the text with the following guidelines:
1. Identify the header/letterhead section with company name, logo, address, etc.
2. Identify any margin text or notes that appear separate from the main content
3. Extract the main letter/document body separately
4. Format the output as follows:
- LETTERHEAD: [letterhead text]
- MARGIN_NOTES: [any text in margins]
- BODY: [main document body]
Be careful not to duplicate content between sections.
"""

    if is_likely_letterhead(image_path, features):
        # Path-specific customization for known problematic documents.
        lowered_path = str(image_path).lower()
        return baldwin_prompt if "baldwin" in lowered_path else general_prompt

    return None
def _append_section_text(sections: Dict[str, List[str]], name: str, lines: List[str]) -> None:
    """Store the stripped text of ``lines`` under ``name``; blank chunks are ignored."""
    chunk = '\n'.join(lines).strip()
    if chunk:
        sections.setdefault(name, []).append(chunk)


def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by regrouping the text under
    standard section headings.

    The text is scanned line by line for the section markers requested by
    the letterhead prompts (``HEADER:``, ``LETTERHEAD:``, ``MARGINS:``,
    ``MARGIN_NOTES:``, ``LETTER:``, ``BODY:``, ``CAPTIONS:``). A line that
    contains a marker starts a new section; text after the marker on that
    line is kept and text before it (such as a leading "- ") is dropped.
    Lines before the first marker are collected as additional content.

    Alias markers feed the same section (HEADER/LETTERHEAD,
    MARGINS/MARGIN_NOTES, LETTER/BODY). A section whose marker occurs more
    than once keeps every chunk, joined by newlines in order of appearance,
    so no OCR text is silently discarded. Sections with no non-blank
    content are omitted from the result.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Text regrouped in the order letterhead, margin notes, document
        body, image captions, additional content, separated by blank lines.
        Returns "" for empty input and the unchanged text when it contains
        no section marker.
    """
    if not text:
        return ""

    # Marker -> canonical section. Insertion order is the order in which
    # markers are tested against each line, so it must stay stable.
    marker_sections = {
        "HEADER:": "LETTERHEAD",
        "LETTERHEAD:": "LETTERHEAD",
        "MARGINS:": "MARGIN_NOTES",
        "MARGIN_NOTES:": "MARGIN_NOTES",
        "LETTER:": "BODY",
        "BODY:": "BODY",
        "CAPTIONS:": "CAPTIONS",
    }

    # Without any marker there is nothing to regroup.
    if not any(marker in text for marker in marker_sections):
        return text

    sections: Dict[str, List[str]] = {}
    current_section = "UNKNOWN"
    current_lines: List[str] = []

    for line in text.split('\n'):
        marker = next((m for m in marker_sections if m in line), None)
        if marker is None:
            current_lines.append(line)
            continue

        # A marker closes the running section and opens the next one.
        _append_section_text(sections, current_section, current_lines)
        current_section = marker_sections[marker]
        remainder = line.split(marker, 1)[1].strip()
        current_lines = [remainder] if remainder else []

    # Flush whatever was collected after the last marker.
    _append_section_text(sections, current_section, current_lines)

    # Fixed output order with the headings shown to the user.
    output_order = (
        ("LETTERHEAD", "--- LETTERHEAD ---"),
        ("MARGIN_NOTES", "--- MARGIN NOTES ---"),
        ("BODY", "--- DOCUMENT BODY ---"),
        ("CAPTIONS", "--- IMAGE CAPTIONS ---"),
        ("UNKNOWN", "--- ADDITIONAL CONTENT ---"),
    )
    return "\n\n".join(
        heading + "\n" + "\n".join(sections[name])
        for name, heading in output_order
        if name in sections
    )