diff --git "a/api/utils.py" "b/api/utils.py" new file mode 100644--- /dev/null +++ "b/api/utils.py" @@ -0,0 +1,2827 @@ +""" +Core processing utilities for DocGenie document generation pipeline. + +Integrated functionality (All 19 Stages): +- Stage 1-2: Seed selection, LLM prompting, response processing, PDF rendering, bbox extraction +- Stage 3: Handwriting & visual element synthesis (WordStylist diffusion, stamps, barcodes, logos) +- Stage 4: Image finalization & OCR (pdf2image, Microsoft Document Intelligence) +- Stage 5: Dataset packaging (bbox normalization, GT verification, analysis, debug viz) + +References generationfolder for core pipeline logic. +""" +import asyncio +import base64 +import json +import pathlib +import tempfile +import time +import uuid +import re +from typing import List, Tuple, Optional, Dict, Any +from io import BytesIO + +import requests +import httpx +from PIL import Image +from pdf2image import convert_from_path +from bs4 import BeautifulSoup +from playwright.async_api import async_playwright +import fitz # PyMuPDF for PDF processing + +from docgenie.generation.constants import BS_PARSER, HANDWRITING_CLASS_NAME, VISUAL_ELEMENT_TYPE_SYNONYMS +from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient, create_message +from docgenie.generation.pipeline_03_process_response import ( + extract_html_documents_from_text, + extract_gt, +) +from docgenie.generation.pipeline_03.css import ( + increase_handwriting_font_size, + unmark_visual_elements, +) +from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import ( + render_pdf_async, + preprocess_html_for_pdf, +) +from docgenie.generation.pipeline_04.extract_bbox import extract_bboxes_from_pdf + +# Stage 3 imports - we implement simplified versions directly in this file +# The full pipeline functions are available but require SynDatasetDefinition +# For API use, we extract elements directly from HTML/CSS +from docgenie.generation.utils.pdfjs import 
# HTTP status codes that signal a transient condition worth retrying.
_RETRYABLE_STATUS_CODES = frozenset({429, 502, 503, 504})


def _is_pdf_response(response, url: str) -> bool:
    """Return True when a downloaded payload should be handled as a PDF."""
    content_type = response.headers.get('Content-Type', '').lower()
    return (
        'application/pdf' in content_type
        or url.lower().endswith('.pdf')
        # Signed URLs ("file.pdf?sig=...") served with a generic content type
        # defeat both checks above, so also sniff the PDF magic number.
        or response.content[:5] == b'%PDF-'
    )


def _render_pdf_first_page(pdf_bytes: bytes) -> "Image.Image":
    """Rasterise the first page of an in-memory PDF to a PIL image at 300 DPI."""
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if len(pdf_document) == 0:
            raise ValueError("PDF has no pages")

        # PDF user space is 72 units per inch, so zoom = DPI / 72.
        zoom = 300 / 72
        pix = pdf_document[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img_data = pix.tobytes("png")
    finally:
        # Close on every path, including the empty-PDF and render-error paths.
        pdf_document.close()

    return Image.open(BytesIO(img_data))


def _response_to_base64_jpeg(response, url: str) -> str:
    """Convert a downloaded image/PDF response into a base64-encoded JPEG."""
    if _is_pdf_response(response, url):
        print(f" 📄 Detected PDF, converting first page to image: {url[:80]}...")
        img = _render_pdf_first_page(response.content)
        print(f" ✓ Converted PDF to image: {img.size[0]}x{img.size[1]}px")
    else:
        img = Image.open(BytesIO(response.content))

    # JPEG cannot store alpha or palette data.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    buffer = BytesIO()
    img.save(buffer, format='JPEG', quality=95)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


def _get_with_retry(url: str, max_retries: int = 3, timeout: int = 30):
    """
    GET ``url``, retrying transient failures with exponential backoff.

    Retries on HTTP 429/502/503/504 and on timeouts / connection errors.
    Any other HTTP error, or a failure on the final attempt, is re-raised
    unchanged.

    Args:
        url: URL to download
        max_retries: Total number of attempts
        timeout: Per-request timeout in seconds

    Returns:
        The successful ``requests`` response object

    Raises:
        requests.exceptions.RequestException: The last download error
        Exception: If ``max_retries`` allows no attempt at all
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        # Exponential backoff: 2s, 4s, ... (no sleep follows the last attempt).
        wait_time = 2 * (2 ** attempt)

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            if status_code not in _RETRYABLE_STATUS_CODES or is_last_attempt:
                raise
            print(f" ⚠️ HTTP {status_code} error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})...")
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            if is_last_attempt:
                raise
            print(f" ⚠️ Network error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries}): {e}")

        time.sleep(wait_time)

    # Only reachable when max_retries <= 0, i.e. no attempt was made.
    raise Exception(f"Failed to download seed image after {max_retries} attempts")


async def download_image_to_base64(url: str) -> str:
    """
    Download image or PDF from URL and convert to base64 JPEG.
    If URL points to a PDF, converts the first page to an image.

    The blocking download and the image/PDF conversion run in the event
    loop's default executor so other coroutines keep running meanwhile.

    Args:
        url: Image or PDF URL

    Returns:
        Base64-encoded JPEG image string

    Raises:
        requests.exceptions.RequestException: If the download fails
        ValueError: If the URL points to a PDF without pages
    """
    def _download_and_encode() -> str:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return _response_to_base64_jpeg(response, url)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _download_and_encode)


def download_seed_images(urls: List[str]) -> List[str]:
    """
    Download multiple seed images/PDFs and convert to base64 (synchronous version for worker).
    If a URL points to a PDF, converts the first page to an image.
    Implements retry logic for transient HTTP errors (503, 502, 504, 429).

    Args:
        urls: List of image or PDF URLs

    Returns:
        List of base64-encoded JPEG image strings, in the same order as ``urls``

    Raises:
        requests.exceptions.RequestException: If a download still fails after retries
        ValueError: If a URL points to a PDF without pages
    """
    return [_response_to_base64_jpeg(_get_with_retry(url), url) for url in urls]
types + types_str = ", ".join(visual_element_types) + + # Example mapping + EXAMPLES = { + "stamp": '- Example: `
`', + "logo": '- Example: ``', + "figure": '- Example: ``', + "barcode": '- Example: ``', + "photo": '- Example: ``' + } + + # Select examples + selected_examples = [] + for t in visual_element_types: + if t in EXAMPLES: + selected_examples.append(EXAMPLES[t]) + if len(selected_examples) >= 2: + break + + # Fallback if somehow no types matched (shouldn't happen with valid types) + if len(selected_examples) == 0: + selected_examples = [EXAMPLES["logo"], EXAMPLES["stamp"]] + + new_block = [ + "## Visual Placeholders (if document type requires)", + "- Insert `