# Source: historical-ocr/testing/test_json_bleed.py
# Author: milwright
# Commit 42dc069: Consolidate segmentation improvements and code cleanup
"""
Test case to verify the fix for JSON bleed-through in historical text.
"""
import sys
import os
from pathlib import Path
# Add parent directory to path
sys.path.append(str(Path(__file__).parent.parent))
from utils.content_utils import format_structured_data
from utils.text_utils import clean_raw_text, format_markdown_text
# Fixture: an excerpt of historical text that deliberately contains bare curly
# braces and a JSON-looking fragment. Assembled from adjacent string literals,
# one group per line of text; every line (including the last) ends in "\n".
SAMPLE_TEXT = (
    "# ENGLISH Credulity; or Ye're all Bottled.\n"
    "O magnus pofldac Inimicis Rifus! Hor. Sat. "
    "WITH Grief, Refentment, and averted Eyes, "
    "Britannia droops to fee her Sons, "
    "(once Wile So fam'd for Arms, for Conduct fo renown'd "
    "With ev'ry Virtue ev'ry Glory crown'd) "
    "Now fink ignoble, and to nothing fall; "
    "Obedient marching forth at Folly's Call.\n"
    "Text containing curly braces like these: { and } should not be parsed as JSON.\n"
    'Even this text with a JSON-like pattern {"key": "value"} should be preserved as-is.\n'
)
def test_format_structured_data():
    """Check that format_structured_data returns brace-bearing text unchanged.

    Passes SAMPLE_TEXT (which contains bare ``{``/``}`` and a JSON-looking
    fragment) through ``format_structured_data`` and asserts that the result
    equals the input and contains no ```json fence.

    Returns:
        True when both assertions hold. The value is read by the script
        runner at the bottom of this file.
        NOTE(review): recent pytest versions warn when a test function
        returns a non-None value; kept so the script runner keeps working.

    Raises:
        AssertionError: if the text was altered or a JSON code block was
            inserted.
    """
    result = format_structured_data(SAMPLE_TEXT)

    # The text must come back as-is, with no attempt to parse JSON-like spans.
    assert result == SAMPLE_TEXT, (
        "format_structured_data altered the input text: "
        f"expected {SAMPLE_TEXT!r}, got {result!r}"
    )
    print("✓ format_structured_data correctly preserves text content")

    # No fenced JSON block may be injected into the output.
    assert "```json" not in result, (
        "format_structured_data inserted a JSON code block into the output"
    )
    print("✓ format_structured_data does not create JSON code blocks")

    return True
if __name__ == "__main__":
    # Script entry point: run the test and report the outcome.
    import traceback

    print("Running JSON bleed-through fix tests...\n")
    try:
        success = test_format_structured_data()
    except AssertionError:
        # A failing assert raises instead of returning a falsy value, so it
        # has to be caught here for the failure branch to be reachable.
        traceback.print_exc()
        success = False

    if success:
        print("\nAll tests passed! The JSON bleed-through issue is fixed.")
    else:
        print("\nSome tests failed.")
        # Non-zero exit status so shells and CI can detect the failure.
        sys.exit(1)