import re
import ast

from .text_utils import clean_raw_text, format_markdown_text

# Fields whose non-empty presence marks a document as having body content.
# NOTE(review): 'main_text' is preferred by extract_document_text() but is not
# counted here -- confirm whether that difference is intentional.
_CONTENT_FIELDS = ('content', 'transcript', 'text')

# Fields that may hold the main document text, in order of preference.
_TEXT_FIELDS = ('main_text', 'content', 'transcript', 'text', 'raw_text')

# Keys that are never counted as document sections.
_NON_SECTION_KEYS = ('raw_text', 'error')

# Image fields that may hold a description, in order of preference.
_DESCRIPTION_FIELDS = ('alt_text', 'caption', 'description')


def _get_ocr_contents(result):
    """Return ``result['ocr_contents']`` if it is a dict, otherwise ``None``.

    ``None`` is also returned when ``result`` itself is not a dict, so callers
    can fall back to their "nothing found" value instead of raising.
    """
    if not isinstance(result, dict):
        return None
    ocr_contents = result.get('ocr_contents')
    return ocr_contents if isinstance(ocr_contents, dict) else None


def classify_document_content(result):
    """Classify document content based on structure and content.

    Args:
        result: Dict expected to carry an ``'ocr_contents'`` dict.

    Returns:
        Dict with the boolean flags ``has_title``, ``has_content``,
        ``has_sections`` and ``is_structured``. Every flag is ``False`` when
        ``result`` has no usable ``'ocr_contents'`` dict.
    """
    classification = {
        'has_title': False,
        'has_content': False,
        'has_sections': False,
        'is_structured': False,
    }

    ocr_contents = _get_ocr_contents(result)
    if ocr_contents is None:
        return classification

    # A title counts only when it is present and non-empty.
    classification['has_title'] = bool(ocr_contents.get('title'))

    # Any single non-empty content field is enough.
    classification['has_content'] = any(
        ocr_contents.get(field) for field in _CONTENT_FIELDS
    )

    # Every non-empty key other than the excluded ones counts as a section;
    # more than two of them marks the document as sectioned.
    section_count = sum(
        1
        for key, value in ocr_contents.items()
        if key not in _NON_SECTION_KEYS and value
    )
    classification['has_sections'] = section_count > 2

    # Structured means all three of the flags above hold.
    classification['is_structured'] = (
        classification['has_title']
        and classification['has_content']
        and classification['has_sections']
    )

    return classification


def extract_document_text(result):
    """Extract the main document text content.

    Args:
        result: Dict expected to carry an ``'ocr_contents'`` dict.

    Returns:
        The first non-empty string found among ``main_text``, ``content``,
        ``transcript``, ``text`` and ``raw_text`` (checked in that order), or
        ``""`` when none qualifies. Non-string values are skipped.
    """
    ocr_contents = _get_ocr_contents(result)
    if ocr_contents is None:
        return ""

    for field in _TEXT_FIELDS:
        content = ocr_contents.get(field)
        # Type check first so non-string values are skipped without ever
        # being evaluated for truthiness.
        if isinstance(content, str) and content:
            return content

    return ""


def extract_image_description(image_data):
    """Extract an image description from image data.

    Args:
        image_data: Dict that may contain ``alt_text``, ``caption`` or
            ``description``.

    Returns:
        The first non-empty value among those fields (checked in that order),
        returned as stored, or ``""`` when ``image_data`` is empty, is not a
        dict, or has none of the fields set.
    """
    if not isinstance(image_data, dict) or not image_data:
        return ""

    for field in _DESCRIPTION_FIELDS:
        description = image_data.get(field)
        if description:
            return description

    return ""


def format_structured_data(content):
    """Format structured data like lists and dictionaries into readable markdown.

    Args:
        content: The content to format (str, list, dict, or any other value).

    Returns:
        Formatted markdown text:
        - ``""`` for any falsy value (``None``, ``""``, ``[]``, ``{}``, ``0``);
        - strings unchanged;
        - lists as one ``- item`` bullet per element;
        - dicts as one ``**key**: value`` line per entry;
        - ``str(content)`` for anything else.
    """
    # Covers empty lists and dicts as well, so the branches below never see
    # an empty container.
    if not content:
        return ""

    # Strings are returned as-is to maintain content purity; this prevents
    # JSON-like text from being transformed inappropriately.
    if isinstance(content, str):
        return content

    # Native Python lists become markdown bullet points.
    if isinstance(content, list):
        return "\n".join(f"- {item}" for item in content)

    # Native Python dictionaries become markdown key-value pairs.
    if isinstance(content, dict):
        return "\n".join(f"**{k}**: {v}" for k, v in content.items())

    # Any other type falls back to its string representation.
    return str(content)