Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / verify_fix.py

milwright

Consolidate segmentation improvements and code cleanup

42dc069 20 days ago

raw

history blame

2.24 kB

	#!/usr/bin/env python3
	import os
	import streamlit as st
	from ocr_processing import process_file

	# Mock a file upload
	class MockFile:
	    """Minimal in-memory stand-in for an uploaded file.

	    Exposes only what this script relies on: a ``name`` attribute and a
	    ``getvalue()`` method that hands back the stored content.
	    """

	    def __init__(self, name, content):
	        """Remember the file's display name and its content."""
	        self.name, self._content = name, content

	    def getvalue(self):
	        """Return the content exactly as it was passed to the constructor."""
	        stored = self._content
	        return stored

	def _has_duplicated_text(raw_text):
	    """Return True when the opening of *raw_text* recurs in its second half.

	    Heuristic: take the first quarter of the text (whitespace-stripped) and
	    look for it anywhere in the second half. Texts of 50 characters or
	    fewer, and stripped first quarters of 20 characters or fewer, are never
	    reported, so short inputs cannot trigger the warning.
	    """
	    if len(raw_text) <= 50:
	        return False
	    half_point = len(raw_text) // 2
	    first_quarter = raw_text[:half_point // 2].strip()
	    return len(first_quarter) > 20 and first_quarter in raw_text[half_point:]


	def test_image(image_path):
	    """Run OCR processing on one image and print a short report.

	    The image is read from disk, wrapped in a ``MockFile`` and handed to
	    ``process_file``. The report lists the keys of ``result['ocr_contents']``,
	    previews ``raw_text`` when that key is present, and states whether the
	    duplicated-text heuristic fired.

	    Args:
	        image_path: Path of the image file to process.

	    Returns:
	        The object returned by ``process_file``, unchanged.

	    Raises:
	        OSError: If the image cannot be opened or read.
	        KeyError: If the result has no ``'ocr_contents'`` entry.
	    """
	    file_name = os.path.basename(image_path)
	    print(f"\n\n===== Testing {file_name} =====")

	    # Load the test image
	    with open(image_path, 'rb') as f:
	        file_bytes = f.read()

	    # Wrap the bytes so process_file sees the same interface as an upload
	    uploaded_file = MockFile(file_name, file_bytes)
	    result = process_file(uploaded_file)

	    # Display results summary
	    print("\nOCR Content Keys:")
	    ocr_contents = result['ocr_contents']
	    for key in ocr_contents:
	        print(f"- {key}")

	    found_duplicated = False
	    if 'raw_text' in ocr_contents:
	        # NOTE(review): assumes raw_text is a str; confirm against process_file
	        raw_text = ocr_contents['raw_text']

	        # Show a preview of raw_text, truncated to 100 characters
	        preview = raw_text[:100] + "..." if len(raw_text) > 100 else raw_text
	        print(f"\nRaw Text Preview: {preview}")

	        # The same text appearing twice in sequence is a sign of duplication
	        found_duplicated = _has_duplicated_text(raw_text)
	        if found_duplicated:
	            print("\n⚠️ WARNING: Possible text duplication detected!")

	    if not found_duplicated:
	        print("\n✅ No text duplication detected")

	    return result

	def main():
	    """Run test_image on each of the hard-coded sample images, in order.

	    The paths are relative, so they are resolved against the current
	    working directory.
	    """
	    # Test with different image types
	    sample_images = (
	        'input/magician-or-bottle-cungerer.jpg',  # The problematic file
	        'input/recipe.jpg',  # Simple text file
	        'input/handwritten-letter.jpg',  # Mixed content
	    )
	    for sample in sample_images:
	        test_image(sample)


	# Run the checks only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()