| """ |
| Figure extraction and processing |
| Uses PIL images directly from Docling extraction |
| """ |
|
|
| from typing import List, Dict, Any |
| from PIL import Image |
|
|
|
|
_THUMBNAIL_SIZE = (200, 200)  # max (width, height) of gallery thumbnails
_MAX_FALLBACK_PAGES = 3  # leading pages used as previews when no figure is usable


def _make_thumbnail(image: Image.Image) -> Image.Image:
    """Return a downscaled copy of *image*; the original is left untouched."""
    # Pillow >= 9.1 groups the filters under Image.Resampling; older releases
    # only expose them as module-level constants (Image.LANCZOS).
    lanczos = getattr(Image, "Resampling", Image).LANCZOS
    thumb = image.copy()
    thumb.thumbnail(_THUMBNAIL_SIZE, lanczos)  # resizes the copy in place
    return thumb


def _build_figure_entry(fig_info: Dict[str, Any]):
    """Turn one figure record into a result dict, or None if it has no usable image."""
    pil_image = fig_info.get("image")

    if pil_image is None:
        print(f"⚠️ Figure missing image: {fig_info}")
        return None

    if not isinstance(pil_image, Image.Image):
        print(f"⚠️ Figure image is not PIL Image: {type(pil_image)}")
        return None

    return {
        "image": pil_image,
        "thumbnail": _make_thumbnail(pil_image),
        "bbox": fig_info.get("bbox"),
        "page": fig_info.get("page", 0),
        "caption": fig_info.get("caption", ""),
    }


def extract_figures(page_images: List[Image.Image], figures_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Process extracted figures from Docling.

    Each entry of ``figures_info`` that carries a PIL image under the
    ``'image'`` key is returned together with a thumbnail (at most 200x200)
    and its metadata. Entries without a usable image are skipped with a
    warning. A failure while processing one figure is reported and does not
    prevent the remaining figures from being processed.

    If no figure could be processed and ``page_images`` is non-empty, the
    first three pages are returned as previews instead.

    Args:
        page_images: List of PIL Images (pages). Only used for the fallback
            when no figure could be processed.
        figures_info: List of figure metadata dicts with 'image', 'bbox',
            'page', 'caption'.

    Returns:
        List of dicts with 'image' (PIL), 'thumbnail', 'bbox', 'page',
        'caption'. Fallback entries have 'bbox' set to None, 'page' set to
        the 0-based index in ``page_images`` and 'caption' "Page N".
    """
    import traceback  # local import: only needed on the error paths

    results = []

    # The outer guard keeps the function best-effort when figures_info itself
    # cannot be iterated (e.g. None); the inner guard isolates each figure.
    try:
        for fig_info in figures_info:
            try:
                entry = _build_figure_entry(fig_info)
            except Exception as e:
                print(f"⚠️ Error processing figure: {e}")
                traceback.print_exc()
                continue

            if entry is None:
                continue

            results.append(entry)
            print(f"✅ Processed figure {len(results)}")
    except Exception as e:
        print(f"⚠️ Error processing figures: {e}")
        traceback.print_exc()

    if not results and page_images:
        print("⚠️ No figures extracted, using page previews as fallback")
        for i, page_img in enumerate(page_images[:_MAX_FALLBACK_PAGES]):
            results.append({
                "image": page_img,
                "thumbnail": _make_thumbnail(page_img),
                "bbox": None,
                "page": i,
                "caption": f"Page {i + 1}",
            })

    return results
|
|