Spaces:
Running
Running
File size: 3,542 Bytes
94e74f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# Standard library imports
import os
import logging
from pathlib import Path
# Configure logging
# Sets up the root logger at INFO level; every record is rendered as
# "<timestamp> - <logger name> - <level> - <message>".
# NOTE(review): this runs at import time, so merely importing this module
# configures the root logger (basicConfig does nothing if the root logger
# already has handlers) — confirm this is intended when used as a library.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
# Substrings searched for in the lower-cased file name (extension included).
_LETTERHEAD_INDICATORS = ('letter', 'letterhead', 'correspondence', 'memo')

# Feature values must be strictly greater than these to trigger detection.
_TOP_DENSITY_THRESHOLD = 0.5
_DENSITY_VARIANCE_THRESHOLD = 0.3


def is_likely_letterhead(file_path, features=None):
    """
    Determine if a document is likely to contain letterhead or marginalia.

    The check is a two-stage heuristic:

    1. The file name (final path component only, case-insensitive) is
       searched for any of the substrings in ``_LETTERHEAD_INDICATORS``.
    2. If ``features`` is provided and non-empty, its ``'top_density'`` and
       ``'density_variance'`` entries are compared against fixed thresholds.

    Args:
        file_path: Path to the document image (``str`` or path-like). Only
            its file name is inspected; the file itself is never opened.
        features: Optional dictionary of pre-extracted features. Recognised
            keys are ``'top_density'`` (letterhead if > 0.5) and
            ``'density_variance'`` (marginalia if > 0.3). A missing key or a
            value of ``None`` is treated as "measurement not available".

    Returns:
        bool: True if the document likely contains letterhead or marginalia,
        False otherwise.
    """
    # Stage 1: simple substring match on the file name.
    file_name = Path(file_path).name.lower()
    if any(indicator in file_name for indicator in _LETTERHEAD_INDICATORS):
        logger.info("Letterhead detected based on filename: %s", file_name)
        return True

    # Stage 2: nothing more to check without extracted features.
    if not features:
        return False

    # High text density at the top of the document may indicate letterhead.
    # A None value means the measurement is unavailable; comparing it with a
    # float would raise TypeError, so it is skipped explicitly.
    top_density = features.get('top_density')
    if top_density is not None and top_density > _TOP_DENSITY_THRESHOLD:
        logger.info(
            "Letterhead detected based on top text density: %s", top_density
        )
        return True

    # Uneven text distribution may indicate marginalia.
    density_variance = features.get('density_variance')
    if (density_variance is not None
            and density_variance > _DENSITY_VARIANCE_THRESHOLD):
        logger.info(
            "Possible marginalia detected based on text density variance"
        )
        return True

    # Default to standard document.
    return False
def get_letterhead_prompt(file_path, features=None):
    """
    Generate a specialized prompt for letterhead document OCR.

    The prompt always starts with a fixed base instruction listing the letter
    components to extract. Extra paragraphs are appended when ``features``
    flags the document as historical and/or as containing marginalia.

    Args:
        file_path: Path to the document image. Accepted for interface
            compatibility; the current implementation does not read it.
        features: Optional dictionary of pre-extracted features. Truthy
            ``'is_historical'`` and ``'has_marginalia'`` entries each append
            one additional instruction paragraph (in that order).

    Returns:
        str: Specialized prompt for letterhead document OCR.
    """
    # Base prompt for all letterhead documents.
    prompt_parts = [
        "This document appears to be a letter or includes letterhead elements. "
        "Please extract the following components separately if present:\n"
        "1. Letterhead (header with logo, organization name, address, etc.)\n"
        "2. Date\n"
        "3. Recipient information (address, name, title)\n"
        "4. Salutation (e.g., 'Dear Sir/Madam')\n"
        "5. Main body text\n"
        "6. Closing (e.g., 'Sincerely')\n"
        "7. Signature\n"
        "8. Any footnotes, marginalia, or annotations\n\n"
        "Preserve the original formatting and structure as much as possible."
    ]

    # Enhanced prompts based on features; each addition carries its own
    # leading blank line so the parts can be joined without a separator.
    if features:
        if features.get('is_historical'):
            prompt_parts.append(
                "\n\nThis appears to be a historical document. Pay special attention to older "
                "letterhead styles, formal language patterns, and period-specific formatting."
            )
        if features.get('has_marginalia'):
            prompt_parts.append(
                "\n\nThe document contains marginalia or handwritten notes in the margins. "
                "Please extract these separately from the main text and indicate their position."
            )

    return "".join(prompt_parts)
|