File size: 2,239 Bytes
42dc069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
import os
import streamlit as st
from ocr_processing import process_file

# Mock a file upload
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Exposes a ``name`` attribute and a ``getvalue()`` method that returns
    the content supplied at construction time.
    """

    def __init__(self, name, content):
        # Keep the content exactly as given; it is only handed back later.
        self.name, self._content = name, content

    def getvalue(self):
        """Return the content this object was created with."""
        return self._content

def test_image(image_path):
    """Run OCR processing on one image and print a short report.

    Reads the file at ``image_path`` in binary mode, wraps its bytes in a
    ``MockFile`` named after the file's basename, and passes that object to
    ``process_file``. It then prints the keys of ``result['ocr_contents']``,
    a preview of ``raw_text`` (first 100 characters) when that key is
    present, and the outcome of a simple duplicated-text heuristic.

    Returns whatever ``process_file`` returned.
    """
    file_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {file_name} =====")

    # Read the raw bytes of the image under test.
    with open(image_path, 'rb') as image_file:
        payload = image_file.read()

    # Hand the bytes to the OCR pipeline through the upload stand-in.
    result = process_file(MockFile(file_name, payload))

    # List which entries the OCR step produced.
    print("\nOCR Content Keys:")
    ocr_contents = result['ocr_contents']
    for key in ocr_contents.keys():
        print(f"- {key}")

    duplicated = False
    if 'raw_text' in ocr_contents:
        raw_text = ocr_contents['raw_text']

        # Preview: at most the first 100 characters, with an ellipsis
        # marker when the text was cut.
        if len(raw_text) > 100:
            preview = raw_text[:100] + "..."
        else:
            preview = raw_text
        print(f"\nRaw Text Preview: {preview}")

        # Duplication heuristic: if the (stripped) first quarter of the
        # text shows up again somewhere in the second half, the text was
        # probably emitted twice. Very short texts are not checked.
        if len(raw_text) > 50:
            midpoint = len(raw_text) // 2
            leading_chunk = raw_text[:midpoint // 2].strip()
            if len(leading_chunk) > 20 and leading_chunk in raw_text[midpoint:]:
                duplicated = True
                print("\n⚠️ WARNING: Possible text duplication detected!")

    if not duplicated:
        print("\n✅ No text duplication detected")

    return result

def main():
    """Run ``test_image`` on each sample image, in order."""
    sample_images = (
        'input/magician-or-bottle-cungerer.jpg',  # The problematic file
        'input/recipe.jpg',                       # Simple text file
        'input/handwritten-letter.jpg',           # Mixed content
    )

    for sample_path in sample_images:
        test_image(sample_path)

# Run the sample checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()