Spaces:
Running
Running
import re | |
import ast | |
from .text_utils import clean_raw_text, format_markdown_text | |
def classify_document_content(result):
    """Classify an OCR result by which kinds of content it contains.

    Args:
        result: OCR result, expected to be a dict holding an
            ``'ocr_contents'`` dict. Any other input (including ``None``
            or an ``'ocr_contents'`` value that is not a dict) produces
            the all-``False`` classification instead of raising.

    Returns:
        A dict with four boolean flags:
            ``has_title``: ``ocr_contents['title']`` exists and is truthy.
            ``has_content``: at least one of ``content``, ``transcript``
                or ``text`` exists and is truthy.
            ``has_sections``: more than two fields are truthy, not
                counting ``raw_text`` and ``error``.
            ``is_structured``: all three flags above are set.
    """
    classification = {
        'has_title': False,
        'has_content': False,
        'has_sections': False,
        'is_structured': False,
    }

    # Guard both levels so a missing/None result degrades to "nothing found".
    ocr_contents = result.get('ocr_contents') if isinstance(result, dict) else None
    if not isinstance(ocr_contents, dict):
        return classification

    # Check for title
    classification['has_title'] = bool(ocr_contents.get('title'))

    # Check for content: any one populated body field is enough
    classification['has_content'] = any(
        ocr_contents.get(field) for field in ('content', 'transcript', 'text')
    )

    # Check for sections: every populated field counts (title and body
    # fields included); only the bookkeeping keys are excluded.
    section_count = sum(
        1
        for key, value in ocr_contents.items()
        if key not in ('raw_text', 'error') and value
    )
    classification['has_sections'] = section_count > 2

    # Check if structured
    classification['is_structured'] = (
        classification['has_title']
        and classification['has_content']
        and classification['has_sections']
    )
    return classification
def extract_document_text(result):
    """Extract the main document text from an OCR result.

    Args:
        result: OCR result, expected to be a dict holding an
            ``'ocr_contents'`` dict. Any other input (including ``None``)
            yields an empty string instead of raising.

    Returns:
        The first non-empty string found among the fields ``main_text``,
        ``content``, ``transcript``, ``text`` and ``raw_text``, checked in
        that order; ``""`` when none of them holds a non-empty string.
    """
    ocr_contents = result.get('ocr_contents') if isinstance(result, dict) else None
    if not isinstance(ocr_contents, dict):
        return ""

    # Preferred order - main_text is prioritized, raw_text is the last resort.
    for field in ('main_text', 'content', 'transcript', 'text', 'raw_text'):
        content = ocr_contents.get(field)
        # A populated but non-string field (e.g. a list or dict) is skipped
        # so that a later field can still supply plain text.
        if content and isinstance(content, str):
            return content
    return ""
def extract_image_description(image_data):
    """Extract an image description from image metadata.

    Args:
        image_data: Dict of image metadata. Empty values and non-dict
            values yield an empty string.

    Returns:
        The first truthy value among the ``alt_text``, ``caption`` and
        ``description`` keys, checked in that order, returned unchanged;
        ``""`` when none of them is present and truthy.
    """
    if not image_data or not isinstance(image_data, dict):
        return ""

    # Try different fields that might contain descriptions
    for field in ('alt_text', 'caption', 'description'):
        description = image_data.get(field)
        if description:
            return description
    return ""
def format_structured_data(content):
    """Format structured data like lists and dictionaries into readable markdown.

    Args:
        content: The content to format (str, list, dict, or any other value).

    Returns:
        Formatted markdown text:
            - ``""`` for any falsy input (``None``, ``""``, ``[]``, ``{}``, ``0``);
            - strings are returned unchanged;
            - lists become one ``- item`` bullet per element;
            - dicts become one ``**key**: value`` line per entry;
            - anything else is converted with ``str()``.
        List items and dict values are interpolated as-is, so nested
        containers appear in their Python representation.
    """
    # A single guard covers every empty container as well as None.
    if not content:
        return ""

    # For string content, return as-is to maintain content purity.
    # This prevents JSON-like text from being transformed inappropriately.
    if isinstance(content, str):
        return content

    # Native Python lists -> markdown bullet points
    if isinstance(content, list):
        return "\n".join(f"- {item}" for item in content)

    # Native Python dictionaries -> markdown key-value pairs
    if isinstance(content, dict):
        return "\n".join(f"**{k}**: {v}" for k, v in content.items())

    # Return as string for other types
    return str(content)