Spaces:
Running
Running
File size: 7,024 Bytes
c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
"""
General utility functions for historical OCR processing.
"""
import os
import base64
import hashlib
import time
import logging
from datetime import datetime
from pathlib import Path
from functools import wraps
# Configure logging: module-level logger registered under the name "utils".
# Its level is set to INFO so that INFO-level records (such as the messages
# emitted through logger.info in this file) pass this logger's own level check.
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)
def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
    """
    Build the key under which an OCR result is cached.

    The key has the form
    "<md5 of file>_<file_type>_<use_vision>_<md5 of options, or empty>",
    followed by "_<md5 of prompt>" when a custom prompt is supplied.

    Args:
        file_bytes: File content as bytes
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        pdf_rotation: PDF rotation value; folded into the options hash when non-zero
        custom_prompt: Custom prompt for OCR

    Returns:
        str: Cache key
    """
    def _md5_text(text):
        # Hex digest of a text value; used for every non-binary key component.
        return hashlib.md5(text.encode()).hexdigest()

    content_digest = hashlib.md5(file_bytes).hexdigest()
    rotated = pdf_rotation != 0

    if preprocessing_options:
        # Work on a copy so the caller's dictionary is never modified.
        effective_options = dict(preprocessing_options)
        if rotated:
            effective_options['pdf_rotation'] = pdf_rotation
        # Sorting the items makes the digest independent of insertion order.
        options_digest = _md5_text(str(sorted(effective_options.items())))
    elif rotated:
        # No options, but the rotation alone still has to distinguish the entry.
        options_digest = _md5_text(f"pdf_rotation_{pdf_rotation}")
    else:
        options_digest = ""

    base_key = f"{content_digest}_{file_type}_{use_vision}_{options_digest}"
    if not custom_prompt:
        return base_key
    return f"{base_key}_{_md5_text(str(custom_prompt))}"
def timing(description):
    """
    Context manager for timing code execution.

    When the ``with`` block ends, logs "<description> took <seconds> seconds"
    through the module-level ``logger`` at INFO level. Exceptions raised inside
    the block are not suppressed.

    Args:
        description: Label placed at the start of the log message

    Returns:
        TimingContext: Object for use in a ``with`` statement; entering it
        returns the same object
    """
    class TimingContext:
        def __init__(self, description):
            self.description = description

        def __enter__(self):
            # perf_counter() is monotonic, so the measured duration cannot be
            # distorted (or turn negative) when the system clock is adjusted.
            self.start_time = time.perf_counter()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            execution_time = time.perf_counter() - self.start_time
            logger.info(f"{self.description} took {execution_time:.2f} seconds")
            # Returning False lets any exception from the with-body propagate.
            return False

    return TimingContext(description)
def format_timestamp(timestamp=None, for_filename=False):
    """
    Format timestamp for display or filenames

    Args:
        timestamp: Datetime object or string to format (defaults to current time).
            Strings are accepted as "YYYY-MM-DD HH:MM[:SS[.ffffff]]" with either
            a space or "T" between date and time; a string in any other form
            falls back to the current time.
        for_filename: Whether to format for use in a filename (defaults to False)

    Returns:
        str: "Mon DD, YYYY" (e.g. "Apr 30, 2025") when for_filename is true,
        otherwise "YYYY-MM-DD HH:MM"
    """
    # Accepted string layouts. The list includes this function's own display
    # format and the str()/isoformat() output of a datetime, so values produced
    # elsewhere in the application parse back instead of silently becoming "now".
    accepted_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
    )

    if timestamp is None:
        timestamp = datetime.now()
    elif isinstance(timestamp, str):
        text = timestamp.strip()
        parsed = None
        for fmt in accepted_formats:
            try:
                parsed = datetime.strptime(text, fmt)
            except ValueError:
                continue
            break
        # Best-effort contract: an unparseable string yields the current time
        # rather than an exception.
        timestamp = parsed if parsed is not None else datetime.now()

    if for_filename:
        # Format suitable for filenames: "Apr 30, 2025"
        return timestamp.strftime("%b %d, %Y")
    # Standard format for display
    return timestamp.strftime("%Y-%m-%d %H:%M")
def _is_period_tag(tag):
    """Return True when a topic tag names a period: a century, a "pre-..." range, or an era."""
    lowered = tag.lower()
    if "century" in lowered or "pre-" in lowered:
        return True
    # Match "era" as a whole word only; a plain substring test would also
    # accept unrelated topics such as "Literature", "General" or "Opera".
    words = ''.join(ch if ch.isalnum() else ' ' for ch in lowered).split()
    return "era" in words


def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
    """
    Create a user-friendly descriptive filename for the result

    The name has the form "<Readable Name> (<doc type>, <period>) - <date><file_ext>";
    the parenthesised part is omitted when neither a document type nor a
    period tag is available.

    Args:
        original_filename: Original filename
        result: OCR result dictionary; the 'detected_document_type' and
            'topics' entries are read when present
        file_ext: File extension, appended verbatim to the end of the name
        preprocessing_options: Dictionary of preprocessing options (not used
            by this function)

    Returns:
        str: Human-readable descriptive filename
    """
    # Base name without extension; dashes and underscores become spaces and
    # every word is capitalized.
    stem = Path(original_filename).stem
    readable_name = ' '.join(
        word.capitalize() for word in stem.replace('-', ' ').replace('_', ' ').split()
    )

    topics = result.get('topics') or []

    # Determine document type: the explicitly detected type wins, otherwise
    # the first topic stands in for it.
    detected_type = result.get('detected_document_type')
    if detected_type:
        doc_type = detected_type.capitalize()
    elif topics:
        doc_type = topics[0]
    else:
        doc_type = None

    # Find period/era information: first topic that looks like a period.
    period_info = next((tag for tag in topics if _is_period_tag(tag)), None)

    # Format metadata within parentheses if available. The period is skipped
    # when it merely repeats the document type (the first topic can be both).
    metadata = []
    if doc_type:
        metadata.append(doc_type)
    if period_info and (not doc_type or period_info.lower() != doc_type.lower()):
        metadata.append(period_info)
    metadata_str = f" ({', '.join(metadata)})" if metadata else ""

    # Add current date for uniqueness and sorting
    date_str = f" - {format_timestamp(for_filename=True)}"

    # Generate final user-friendly filename
    return f"{readable_name}{metadata_str}{date_str}{file_ext}"
def extract_subject_tags(result, raw_text, preprocessing_options=None):
    """
    Extract subject tags from OCR result

    Tags start from the result's topics, followed by the capitalized detected
    document type when it is set and not already listed. Only when that leaves
    no tags at all are the defaults used, optionally extended by simple
    keyword checks on the beginning of the raw text.

    Args:
        result: OCR result dictionary; the 'topics' and
            'detected_document_type' entries are read when present
        raw_text: Raw text from OCR (None is treated as empty text)
        preprocessing_options: Dictionary of preprocessing options (not used
            by this function)

    Returns:
        list: Subject tags (a new list; the result's own topics list is not modified)
    """
    # Use existing topics as starting point if available
    subject_tags = list(result.get('topics') or [])

    # Add document type if detected. A missing, None or empty value is skipped
    # so that no blank tag is produced and None is never capitalized.
    detected_type = result.get('detected_document_type')
    if detected_type:
        doc_type = detected_type.capitalize()
        if doc_type not in subject_tags:
            subject_tags.append(doc_type)

    # If no tags were found, add some defaults
    if not subject_tags:
        subject_tags = ["Document", "Historical Document"]

        # Lower-case once and reuse the result for every keyword window.
        lowered = (raw_text or "").lower()

        # Try to infer content type
        if "letter" in lowered[:1000] or "dear" in lowered[:200]:
            subject_tags.append("Letter")

        # Check if it might be a newspaper
        if "newspaper" in lowered[:1000] or "editor" in lowered[:500]:
            subject_tags.append("Newspaper")

    return subject_tags
|