Spaces:
Running
Running
# Standard library imports | |
import os | |
import logging | |
from pathlib import Path | |
# Logging setup: timestamped records at INFO level and above, plus a
# module-scoped logger shared by the functions in this file.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
def is_likely_letterhead(file_path, features=None):
    """
    Determine if a document is likely to contain letterhead or marginalia.

    The check is a two-stage heuristic: the lower-cased file name is searched
    for indicator substrings first; if none match and ``features`` is given,
    its values are compared against fixed thresholds.

    Args:
        file_path: Path to the document image. Only the final path component
            (the file name, including extension) is inspected.
        features: Optional dictionary of pre-extracted features. Recognized
            keys are 'top_density' (flagged when > 0.5) and
            'density_variance' (flagged when > 0.3). Keys that are missing or
            set to None are treated as unavailable and skipped.

    Returns:
        bool: True if the document likely contains letterhead, False otherwise
    """
    # Simple logic based on filename for initial version
    file_name = Path(file_path).name.lower()
    # Substring match: 'letter' already covers 'letterhead'; both are listed
    # so the tuple documents every intended indicator.
    letterhead_indicators = ('letter', 'letterhead', 'correspondence', 'memo')

    if any(indicator in file_name for indicator in letterhead_indicators):
        logger.info("Letterhead detected based on filename: %s", file_name)
        return True

    # Check features if provided (None or an empty dict skips this stage)
    if features:
        # High text density at the top of the document may indicate letterhead.
        # A None value means the feature was not computed, so it must not be
        # compared against the threshold (that would raise TypeError).
        top_density = features.get('top_density')
        if top_density is not None and top_density > 0.5:
            logger.info("Letterhead detected based on top text density: %s", top_density)
            return True

        # Uneven text distribution may indicate marginalia
        density_variance = features.get('density_variance')
        if density_variance is not None and density_variance > 0.3:
            logger.info("Possible marginalia detected based on text density variance")
            return True

    # Default to standard document
    return False
def get_letterhead_prompt(file_path, features=None):
    """
    Build the OCR prompt used for letterhead-style documents.

    The prompt always contains the component checklist followed by a
    formatting instruction. Extra paragraphs are appended when ``features``
    carries truthy 'is_historical' and/or 'has_marginalia' entries.

    Args:
        file_path: Path to the document image (not used when building the text)
        features: Optional dictionary of pre-extracted features

    Returns:
        str: Specialized prompt for letterhead document OCR
    """
    checklist = "\n".join([
        "This document appears to be a letter or includes letterhead elements. "
        "Please extract the following components separately if present:",
        "1. Letterhead (header with logo, organization name, address, etc.)",
        "2. Date",
        "3. Recipient information (address, name, title)",
        "4. Salutation (e.g., 'Dear Sir/Madam')",
        "5. Main body text",
        "6. Closing (e.g., 'Sincerely')",
        "7. Signature",
        "8. Any footnotes, marginalia, or annotations",
    ])
    # Paragraphs of the final prompt; they are joined by a blank line below.
    paragraphs = [
        checklist,
        "Preserve the original formatting and structure as much as possible.",
    ]

    # Feature flag -> paragraph appended when that flag is present and truthy.
    optional_paragraphs = (
        ('is_historical',
         "This appears to be a historical document. Pay special attention to older "
         "letterhead styles, formal language patterns, and period-specific formatting."),
        ('has_marginalia',
         "The document contains marginalia or handwritten notes in the margins. "
         "Please extract these separately from the main text and indicate their position."),
    )
    if features:
        for flag, paragraph in optional_paragraphs:
            if flag in features and features[flag]:
                paragraphs.append(paragraph)

    return "\n\n".join(paragraphs)