File size: 10,354 Bytes
94e74f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Standard library imports
import re
import logging
from difflib import SequenceMatcher
from typing import Tuple, Dict, Any, List, Optional

# Configure logging.
# NOTE: basicConfig is called at import time, so importing this module sets up
# root logging (INFO level, timestamp/name/level/message format) as a side effect.
# Per the logging docs this call does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, 
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

def detect_duplicate_text_issues(text: str) -> Tuple[bool, Dict[str, Any]]:
    """
    Detect if OCR text has duplication issues often found in handwritten document OCR

    Three independent signals are measured and the largest of them becomes the
    overall duplication rate:
      * exact repeats of whole lines (lines shorter than 5 characters after
        stripping are ignored),
      * near-identical consecutive 100-character blocks (SequenceMatcher ratio
        above 0.8; each block is compared only with the 9 blocks after it),
      * a word immediately followed by the same word.

    Args:
        text: OCR text to analyze

    Returns:
        Tuple of (has_duplication_issues, details_dict). The flag is True when
        the duplication rate exceeds 0.1. details_dict always carries the same
        metric keys; for input that is empty or shorter than 100 characters all
        metrics are zero/empty and an extra "details" key explains that no
        analysis was run.
    """
    # Early exit for empty or very short text. The full metric schema is still
    # returned so callers can index the result without special-casing this path.
    if not text or len(text) < 100:
        return False, {
            "duplication_rate": 0.0,
            "line_repetition_rate": 0.0,
            "block_repetition_rate": 0.0,
            "word_repetition_rate": 0.0,
            "repeated_lines": 0,
            "repeated_blocks": 0,
            "repeated_words": 0,
            "duplicate_sections": [],
            "repetition_indices": [],
            "details": "Text too short for analysis",
        }

    # --- Signal 1: exact line repetitions ---------------------------------
    lines = text.split('\n')
    line_count = len(lines)

    repeated_lines = 0
    line_repetition_indices = []
    first_seen_at = {}  # stripped line -> index of its first occurrence
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Very short lines (blank lines, page numbers, ...) are not informative
        if len(stripped) < 5:
            continue

        if stripped in first_seen_at:
            repeated_lines += 1
            line_repetition_indices.append((first_seen_at[stripped], i))
        else:
            first_seen_at[stripped] = i

    # The denominator is the total line count, including the skipped short lines
    line_repetition_rate = repeated_lines / max(1, line_count)

    # --- Signal 2: near-duplicate fixed-size blocks -----------------------
    # Only complete 100-character blocks are considered; a shorter tail is dropped
    text_blocks = [text[i:i+100] for i in range(0, len(text), 100) if i+100 <= len(text)]
    block_count = len(text_blocks)

    repeated_blocks = 0
    duplicate_sections = []
    for i in range(block_count):
        for j in range(i+1, min(i+10, block_count)):  # Only check nearby blocks for efficiency
            # Argument order is kept as (block i, block j): ratio() is not
            # guaranteed to be symmetric.
            similarity = SequenceMatcher(None, text_blocks[i], text_blocks[j]).ratio()
            if similarity > 0.8:  # High similarity threshold
                repeated_blocks += 1
                duplicate_sections.append((i, j, similarity))
                break  # Count each block at most once

    block_repetition_rate = repeated_blocks / max(1, block_count)

    # --- Signal 3: a word immediately repeated (common OCR mistake) --------
    word_pattern = r'\b(\w+)\s+\1\b'
    repeated_words = len(re.findall(word_pattern, text))
    repeated_words_rate = repeated_words / max(1, len(text.split()))

    # The overall rate is the strongest of the three signals
    duplication_rate = max(line_repetition_rate, block_repetition_rate, repeated_words_rate)

    # Log detailed analysis
    logger.info(f"OCR duplication analysis: line_repetition={line_repetition_rate:.2f}, "
               f"block_repetition={block_repetition_rate:.2f}, "
               f"word_repetition={repeated_words_rate:.2f}, "
               f"final_rate={duplication_rate:.2f}")

    # Determine if this is a serious issue
    has_duplication = duplication_rate > 0.1

    # Return detailed results
    return has_duplication, {
        "duplication_rate": duplication_rate,
        "line_repetition_rate": line_repetition_rate,
        "block_repetition_rate": block_repetition_rate,
        "word_repetition_rate": repeated_words_rate,
        "repeated_lines": repeated_lines,
        "repeated_blocks": repeated_blocks,
        "repeated_words": repeated_words,
        "duplicate_sections": duplicate_sections[:10],  # Only include the first 10 for brevity
        "repetition_indices": line_repetition_indices[:10]
    }

def get_enhanced_preprocessing_options(current_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Build preprocessing options tuned for OCR on handwritten documents

    The caller's options (if any) are copied, never mutated. The handwriting
    settings below then overwrite any existing values for the same keys, and an
    existing "sharpen" entry is forced to False.

    Args:
        current_options: Current preprocessing options (if available)

    Returns:
        Dict of enhanced options
    """
    # Settings applied on top of whatever the caller supplied
    handwriting_overrides = {
        "document_type": "handwritten",
        "contrast": 1.4,              # Higher than default, for better handwriting extraction
        "grayscale": True,
        "adaptive_threshold": True,   # Adaptive thresholding optimized for handwriting
        "threshold_block_size": 25,   # Larger block size for handwriting
        "threshold_c": 10,            # Adjusted C value for better handwriting detection
        "binarize": False,            # Standard binarization often loses handwriting detail
        "denoise": True,              # Despeckle to reduce noise
        "handwriting_mode": True,     # Handwriting-specific preprocessing
    }

    if current_options:
        options = current_options.copy()
    else:
        options = {}

    for key, value in handwriting_overrides.items():
        options[key] = value

    # Sharpening is only switched off when the caller had it configured;
    # the key is not introduced otherwise.
    if "sharpen" in options:
        options["sharpen"] = False

    logger.info(f"Enhanced handwriting preprocessing options generated: {options}")
    return options

def get_handwritten_specific_prompt(current_prompt: Optional[str] = None) -> str:
    """
    Build the OCR prompt used for handwritten documents

    The result always starts with a fixed handwriting transcription prompt.
    An existing prompt is merged in as follows:
      * no prompt (None or empty): the fixed prompt alone is returned;
      * a prompt that never mentions "handwritten"/"handwriting": it is appended
        verbatim under "Additional context from user:";
      * a prompt that does mention them: it is split on '.', the sentences that
        mention handwriting are dropped as redundant, and the remaining ones are
        appended under "Additional instructions:". If nothing remains, the
        fixed prompt alone is returned.

    Args:
        current_prompt: Current prompt (if available)

    Returns:
        str: Enhanced prompt for handwritten documents
    """
    # Base prompt for all handwritten documents
    base_prompt = ("This is a handwritten document that requires careful transcription. "
                  "Please transcribe all visible handwritten text, preserving the original "
                  "line breaks, paragraph structure, and any special formatting or indentation. "
                  "Pay special attention to:\n"
                  "1. Words that may be difficult to read due to handwriting style\n"
                  "2. Any crossed-out text (indicate with [crossed out: possible text])\n"
                  "3. Insertions or annotations between lines or in margins\n"
                  "4. Maintain the spatial layout of the text as much as possible\n"
                  "5. If there are multiple columns or non-linear text, preserve the reading order\n\n"
                  "If you cannot read a word with confidence, indicate with [?] or provide your best guess as [word?].")

    if not current_prompt:
        return base_prompt

    def mentions_handwriting(fragment: str) -> bool:
        lowered = fragment.lower()
        return "handwritten" in lowered or "handwriting" in lowered

    # Prompt without any handwriting instructions: keep it as-is, as extra context
    if not mentions_handwriting(current_prompt):
        return f"{base_prompt}\n\nAdditional context from user:\n{current_prompt}"

    # Prompt overlaps with the base prompt: keep only the sentences that add
    # something else. Splitting on '.' is a simplification (it also splits
    # decimals and abbreviations).
    extra_sentences = []
    for fragment in current_prompt.split('.'):
        sentence = fragment.strip()
        if sentence and not mentions_handwriting(sentence):
            extra_sentences.append(sentence)

    if not extra_sentences:
        return base_prompt

    return base_prompt + "\n\nAdditional instructions:\n" + ". ".join(extra_sentences) + "."

def _indices_without_repeated_phrases(words: List[str]) -> List[int]:
    """Return the indices of ``words`` left after dropping back-to-back repeats of 3-6 word phrases."""
    folded = [word.lower() for word in words]  # Phrases are compared case-insensitively
    total = len(folded)
    kept: List[int] = []
    pos = 0

    while pos < total:
        # Find the shortest phrase starting here that is immediately repeated
        repeat_len = 0
        for phrase_len in range(3, 7):
            if pos + 2 * phrase_len > total:
                break  # Not enough words left for two copies of this (or any longer) phrase
            if folded[pos:pos + phrase_len] == folded[pos + phrase_len:pos + 2 * phrase_len]:
                repeat_len = phrase_len
                break

        if not repeat_len:
            kept.append(pos)
            pos += 1
            continue

        # Keep the first copy, then skip every further consecutive copy
        # (not just the second one, so triple repeats collapse fully).
        phrase = folded[pos:pos + repeat_len]
        kept.extend(range(pos, pos + repeat_len))
        pos += repeat_len
        while folded[pos:pos + repeat_len] == phrase:
            pos += repeat_len

    return kept


def clean_duplicated_text(text: str) -> str:
    """
    Clean up duplicated text often found in OCR output for handwritten documents

    Cleaning happens in three passes:
      1. consecutive duplicate lines are dropped (blank lines in between do not
         reset the comparison) and runs of blank lines shrink to one blank line;
      2. a word repeated back-to-back any number of times is reduced to one
         occurrence (case-sensitive, as matched by the regex);
      3. a phrase of 3 to 6 words repeated back-to-back any number of times is
         reduced to its first occurrence (case-insensitive).

    Line breaks and single blank lines that survive pass 1 are preserved in the
    result. Whitespace inside a line is normalized to single spaces, and leading
    and trailing whitespace of the whole text is removed. One exception: a word
    repeated across a line break in the middle of both lines is merged onto one
    line by pass 2.

    Args:
        text: OCR text to clean

    Returns:
        str: Cleaned text with duplications removed
    """
    if not text:
        return text

    # Pass 1: remove consecutive duplicate lines
    deduped_lines = []
    prev_line = None

    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            # Keep only the first of a run of empty lines
            if not deduped_lines or deduped_lines[-1].strip():
                deduped_lines.append(line)
            continue

        # Skip if this line is a duplicate of the previous non-empty line
        if stripped == prev_line:
            continue

        deduped_lines.append(line)
        prev_line = stripped

    deduped_text = '\n'.join(deduped_lines)

    # Pass 2: collapse a whole run of the same word ("the the the" -> "the")
    word_run_pattern = r'\b(\w+)(?:\s+\1\b)+'
    deduped_text = re.sub(word_run_pattern, r'\1', deduped_text)

    # Pass 3: remove repeated phrases. Each token is a word plus the whitespace
    # that preceded it, so line breaks can be restored when the text is rebuilt.
    tokens = re.findall(r'(\s*)(\S+)', deduped_text)
    kept = set(_indices_without_repeated_phrases([word for _, word in tokens]))

    pieces = []
    pending_breaks = 0  # Largest number of newlines seen since the last kept word
    for index, (gap, word) in enumerate(tokens):
        pending_breaks = max(pending_breaks, gap.count('\n'))
        if index not in kept:
            continue  # Dropped word; its line break (if any) carries over to the next kept word
        if pieces:
            pieces.append('\n' * pending_breaks if pending_breaks else ' ')
        pieces.append(word)
        pending_breaks = 0

    final_text = ''.join(pieces)

    # Log the cleaning results
    original_len = len(text)
    cleaned_len = len(final_text)
    reduction = 100 * (original_len - cleaned_len) / max(1, original_len)

    logger.info(f"Text cleaning: removed {original_len - cleaned_len} chars ({reduction:.1f}% reduction)")

    return final_text