Spaces:

milwright
/

historical-ocr

Running

File size: 10,354 Bytes

94e74f0

# Standard library imports
import re
import logging
from difflib import SequenceMatcher
from typing import Tuple, Dict, Any, List, Optional

# Configure logging.
# NOTE: basicConfig() runs when this module is imported and configures the
# root logger (INFO level, timestamped "time - name - level - message"
# format). Per the standard library it has no effect if the root logger
# already has handlers configured.
logging.basicConfig(level=logging.INFO, 
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger named after this module; used by the functions below.
logger = logging.getLogger(__name__)

def detect_duplicate_text_issues(text: str) -> Tuple[bool, Dict[str, Any]]:
    """
    Detect if OCR text has duplication issues often found in handwritten document OCR

    Three signals are measured and the largest of them is reported as the
    overall duplication rate:
      - exact repeats of whole lines (lines shorter than 5 characters after
        stripping are ignored), relative to the total number of lines;
      - consecutive 100-character blocks that are more than 80% similar to
        one of the 9 blocks that follow them, relative to the block count;
      - a word immediately followed by itself, relative to the word count.

    Args:
        text: OCR text to analyze

    Returns:
        Tuple of (has_duplication_issues, details_dict). The flag is True
        when the overall rate exceeds 0.1. details_dict always carries the
        same metric keys; for empty text or text shorter than 100 characters
        every metric is zero/empty and an extra "details" entry says why.
    """
    # Early exit for empty or very short text. The result carries the same
    # keys as the full analysis so callers can read any metric without
    # having to special-case short input.
    if not text or len(text) < 100:
        return False, {
            "duplication_rate": 0.0,
            "line_repetition_rate": 0.0,
            "block_repetition_rate": 0.0,
            "word_repetition_rate": 0.0,
            "repeated_lines": 0,
            "repeated_blocks": 0,
            "repeated_words": 0,
            "duplicate_sections": [],
            "repetition_indices": [],
            "details": "Text too short for analysis",
        }

    # --- Signal 1: exact line repetitions ---------------------------------
    lines = text.split('\n')
    line_count = len(lines)

    repeated_lines = 0
    line_repetition_indices = []
    seen_lines = {}  # stripped line -> index of its first occurrence
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Very short or empty lines repeat naturally; ignore them
        if len(stripped) < 5:
            continue

        if stripped in seen_lines:
            repeated_lines += 1
            line_repetition_indices.append((seen_lines[stripped], i))
        else:
            seen_lines[stripped] = i

    # Denominator is every line (including the skipped short ones)
    line_repetition_rate = repeated_lines / max(1, line_count)

    # --- Signal 2: near-duplicate 100-character blocks --------------------
    # Only complete blocks are used; a trailing partial block is dropped.
    text_blocks = [text[start:start + 100] for start in range(0, len(text) - 99, 100)]
    block_count = len(text_blocks)

    repeated_blocks = 0
    duplicate_sections = []
    for i in range(block_count):
        # Only check nearby blocks for efficiency
        for j in range(i + 1, min(i + 10, block_count)):
            similarity = SequenceMatcher(None, text_blocks[i], text_blocks[j]).ratio()
            if similarity > 0.8:  # High similarity threshold
                repeated_blocks += 1
                duplicate_sections.append((i, j, similarity))
                # Count each block at most once
                break

    block_repetition_rate = repeated_blocks / max(1, block_count)

    # --- Signal 3: a word immediately repeated (common OCR mistake) -------
    word_pattern = r'\b(\w+)\s+\1\b'
    repeated_words = len(re.findall(word_pattern, text))
    repeated_words_rate = repeated_words / max(1, len(text.split()))

    # The overall rate is the worst of the three signals
    duplication_rate = max(line_repetition_rate, block_repetition_rate, repeated_words_rate)

    # Log detailed analysis
    logger.info(f"OCR duplication analysis: line_repetition={line_repetition_rate:.2f}, "
               f"block_repetition={block_repetition_rate:.2f}, "
               f"word_repetition={repeated_words_rate:.2f}, "
               f"final_rate={duplication_rate:.2f}")

    # Determine if this is a serious issue
    has_duplication = duplication_rate > 0.1

    return has_duplication, {
        "duplication_rate": duplication_rate,
        "line_repetition_rate": line_repetition_rate,
        "block_repetition_rate": block_repetition_rate,
        "word_repetition_rate": repeated_words_rate,
        "repeated_lines": repeated_lines,
        "repeated_blocks": repeated_blocks,
        "repeated_words": repeated_words,
        "duplicate_sections": duplicate_sections[:10],  # Only include the first 10 for brevity
        "repetition_indices": line_repetition_indices[:10]
    }

def get_enhanced_preprocessing_options(current_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Build preprocessing options tuned for OCR on handwritten documents

    The caller's options are copied (never modified) and the
    handwriting-specific settings are written over them.

    Args:
        current_options: Current preprocessing options (if available)

    Returns:
        Dict of enhanced options
    """
    # Work on a copy so the caller's dict is left alone
    enhanced = current_options.copy() if current_options else {}

    enhanced.update({
        # Treat the document as handwritten
        "document_type": "handwritten",
        # Higher than default contrast for better handwriting extraction
        "contrast": 1.4,
        "grayscale": True,
        # Adaptive thresholding with a larger block size and adjusted C value
        "adaptive_threshold": True,
        "threshold_block_size": 25,
        "threshold_c": 10,
        # Standard binarization often loses handwriting detail
        "binarize": False,
        # Despeckle to reduce noise
        "denoise": True,
        # Handwriting-specific preprocessing
        "handwriting_mode": True,
    })

    # Sharpening is switched off only when the caller supplied the key;
    # it is never introduced here
    if "sharpen" in enhanced:
        enhanced["sharpen"] = False

    logger.info(f"Enhanced handwriting preprocessing options generated: {enhanced}")
    return enhanced

def get_handwritten_specific_prompt(current_prompt: Optional[str] = None) -> str:
    """
    Build the transcription prompt used for handwritten document OCR

    Without an existing prompt the base handwriting prompt is returned.
    An existing prompt that does not mention handwriting is appended as
    user context. One that does mention it is split on '.', the sentences
    that mention handwriting are dropped, and whatever remains is appended
    as additional instructions (nothing is appended if nothing remains).

    Args:
        current_prompt: Current prompt (if available)

    Returns:
        str: Enhanced prompt for handwritten documents
    """
    # Base prompt for all handwritten documents
    base_prompt = "".join((
        "This is a handwritten document that requires careful transcription. ",
        "Please transcribe all visible handwritten text, preserving the original ",
        "line breaks, paragraph structure, and any special formatting or indentation. ",
        "Pay special attention to:\n",
        "1. Words that may be difficult to read due to handwriting style\n",
        "2. Any crossed-out text (indicate with [crossed out: possible text])\n",
        "3. Insertions or annotations between lines or in margins\n",
        "4. Maintain the spatial layout of the text as much as possible\n",
        "5. If there are multiple columns or non-linear text, preserve the reading order\n\n",
        "If you cannot read a word with confidence, indicate with [?] or provide your best guess as [word?].",
    ))

    # Nothing to merge
    if not current_prompt:
        return base_prompt

    lowered = current_prompt.lower()
    if "handwritten" not in lowered and "handwriting" not in lowered:
        # No handwriting instructions in the current prompt, just append it
        return f"{base_prompt}\n\nAdditional context from user:\n{current_prompt}"

    # The prompt already talks about handwriting: keep only the sentences
    # that do not, since the base prompt covers the rest.
    # Splitting on '.' is a simplification and might need improvement.
    extra_instructions = []
    for fragment in current_prompt.split('.'):
        sentence = fragment.strip()
        if not sentence:
            continue
        sentence_lower = sentence.lower()
        if "handwritten" in sentence_lower or "handwriting" in sentence_lower:
            continue
        extra_instructions.append(sentence)

    if not extra_instructions:
        return base_prompt

    return base_prompt + "\n\nAdditional instructions:\n" + ". ".join(extra_instructions) + "."

def clean_duplicated_text(text: str) -> str:
    """
    Clean up duplicated text often found in OCR output for handwritten documents

    Three passes are applied in order:
      1. a line that equals the previous non-empty line (ignoring
         surrounding whitespace) is dropped, and runs of blank lines are
         reduced to a single blank line;
      2. a word repeated back-to-back any number of times is reduced to one
         occurrence;
      3. a phrase of 3 to 6 words repeated back-to-back any number of times
         (compared case-insensitively) is reduced to its first occurrence.

    Line breaks and spacing of the text that is kept are preserved; only
    the duplicated material and the whitespace in front of it are removed.

    Args:
        text: OCR text to clean

    Returns:
        str: Cleaned text with duplications removed. Empty or None input is
        returned unchanged.
    """
    if not text:
        return text

    # Pass 1: line-based deduplication
    deduped_lines = []
    prev_line = None
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            # Keep only the first blank line of a run
            if not deduped_lines or deduped_lines[-1].strip():
                deduped_lines.append(line)
            continue

        # prev_line is not reset by blank lines, so a line repeated right
        # after a blank line is still treated as a duplicate
        if stripped == prev_line:
            continue

        deduped_lines.append(line)
        prev_line = stripped

    deduped_text = '\n'.join(deduped_lines)

    # Pass 2: collapse runs of the same word ("the the the" -> "the").
    # The trailing "+" matters: matching only a pair would leave one
    # duplicate behind for every odd-length run.
    deduped_text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', deduped_text)

    # Pass 3: collapse repeated phrases without disturbing other whitespace
    final_text = _drop_repeated_phrases(deduped_text)

    # Log the cleaning results
    original_len = len(text)
    cleaned_len = len(final_text)
    reduction = 100 * (original_len - cleaned_len) / max(1, original_len)

    logger.info(f"Text cleaning: removed {original_len - cleaned_len} chars ({reduction:.1f}% reduction)")

    return final_text


def _drop_repeated_phrases(text: str) -> str:
    """Remove back-to-back repeats of 3-6 word phrases, keeping the first copy.

    Words are maximal runs of non-whitespace and are compared
    case-insensitively. Each removed repeat is cut together with the
    whitespace that precedes it; every other character of ``text`` is
    returned as-is.
    """
    spans = [match.span() for match in re.finditer(r'\S+', text)]
    words = [text[start:end].lower() for start, end in spans]
    total = len(words)

    pieces = []
    copied_upto = 0  # offset in text up to which output has been emitted
    i = 0
    while i < total:
        # Shortest phrase length (3 to 6 words) that repeats right here
        repeat_len = 0
        for phrase_len in range(3, min(7, total - i)):
            if words[i:i + phrase_len] == words[i + phrase_len:i + 2 * phrase_len]:
                repeat_len = phrase_len
                break

        if not repeat_len:
            i += 1
            continue

        first_end = i + repeat_len
        dup_end = first_end + repeat_len
        # Swallow any further consecutive copies of the same phrase
        while words[dup_end:dup_end + repeat_len] == words[i:first_end]:
            dup_end += repeat_len

        # Emit everything through the first copy, then resume right after
        # the last word of the final repeat
        pieces.append(text[copied_upto:spans[first_end - 1][1]])
        copied_upto = spans[dup_end - 1][1]
        i = dup_end

    pieces.append(text[copied_upto:])
    return ''.join(pieces)