historical-ocr / verify_fix.py
milwright's picture
Consolidate segmentation improvements and code cleanup
42dc069
raw
history blame
2.24 kB
#!/usr/bin/env python3
import os
import streamlit as st
from ocr_processing import process_file
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Carries a ``name`` attribute and a ``getvalue()`` method, which is all
    this script supplies to ``process_file`` in place of a real upload.
    """

    def __init__(self, name, content):
        # The payload is kept private and handed back through getvalue().
        self.name, self._content = name, content

    def getvalue(self):
        """Return the content given at construction time, unchanged."""
        payload = self._content
        return payload
def _opening_reappears_later(text):
    """Return True when the stripped first quarter of *text* recurs in its second half."""
    # Short texts are never considered duplicated.
    if len(text) <= 50:
        return False
    midpoint = len(text) // 2
    opening = text[:midpoint // 2].strip()
    # A probe of 20 characters or fewer is too short to count as evidence.
    if len(opening) <= 20:
        return False
    return opening in text[midpoint:]


def test_image(image_path):
    """Run OCR on one image file and print a short report.

    The image is read from disk, wrapped in a ``MockFile`` and passed to
    ``process_file``. The report lists the keys of ``result['ocr_contents']``,
    shows up to 100 characters of ``raw_text`` when that key is present, and
    states whether the text looks duplicated.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The value returned by ``process_file``.
    """
    image_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {image_name} =====")

    # Read the raw bytes and present them as an uploaded file.
    with open(image_path, 'rb') as image_file:
        uploaded_file = MockFile(image_name, image_file.read())

    result = process_file(uploaded_file)

    # List every field the OCR step produced.
    print("\nOCR Content Keys:")
    ocr_contents = result['ocr_contents']
    for field in ocr_contents.keys():
        print(f"- {field}")

    duplicated = False
    if 'raw_text' in ocr_contents:
        raw_text = ocr_contents['raw_text']

        # Truncate long text so the preview stays on roughly one line.
        if len(raw_text) > 100:
            preview = raw_text[:100] + "..."
        else:
            preview = raw_text
        print(f"\nRaw Text Preview: {preview}")

        duplicated = _opening_reappears_later(raw_text)

    # Without raw_text nothing was checked, which is reported as "no duplication".
    if duplicated:
        print("\n⚠️ WARNING: Possible text duplication detected!")
    else:
        print("\n✅ No text duplication detected")

    return result
def main():
    """Run ``test_image`` on each of the hard-coded sample images, in order."""
    sample_images = (
        'input/magician-or-bottle-cungerer.jpg',  # the problematic file
        'input/recipe.jpg',                       # simple text file
        'input/handwritten-letter.jpg',           # mixed content
    )
    for sample_path in sample_images:
        test_image(sample_path)


# Only run the checks when executed as a script, not when imported.
if __name__ == "__main__":
    main()