| """ |
| Dataset Export Manager for DocGenie API |
| |
| Handles organizing generated documents into a proper dataset structure |
| following the original pipeline's SyntheticDatasetFileStructure pattern. |
| """ |
|
|
| import pathlib |
| import json |
| import base64 |
| import shutil |
| from collections import Counter |
| from typing import Dict, List, Optional, Any |
|
|
|
|
class DatasetExporter:
    """
    Manages export of generated documents to organized dataset structure.

    Structure follows original pipeline pattern:
    - Single msgpack for all documents
    - Categorized folders (html/, pdf/, bbox/, etc.)
    - Subfolders for per-document tokens
    """

    def __init__(self, base_path: pathlib.Path, dataset_name: str = "docgenie_documents"):
        """
        Initialize dataset exporter.

        Args:
            base_path: Base directory for dataset export
            dataset_name: Name of the dataset (will be subfolder name)
        """
        self.base_path = base_path / dataset_name
        self.dataset_name = dataset_name
        self.documents = []

        self._create_directory_structure()

        # Running cost/usage totals across all LLM calls for this dataset
        self.cost_summary = {
            "total_cost_usd": 0.0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "total_cache_creation_tokens": 0,
            "total_cache_read_tokens": 0,
            "num_messages": 0
        }

    def add_cost(self, cost_usd: float, input_tokens: int, output_tokens: int,
                 cache_creation_tokens: int = 0, cache_read_tokens: int = 0):
        """Add LLM cost and token usage to global summary."""
        self.cost_summary["total_cost_usd"] += cost_usd
        self.cost_summary["total_input_tokens"] += input_tokens
        self.cost_summary["total_output_tokens"] += output_tokens
        self.cost_summary["total_cache_creation_tokens"] += cache_creation_tokens
        self.cost_summary["total_cache_read_tokens"] += cache_read_tokens
        self.cost_summary["num_messages"] += 1

    def _create_directory_structure(self):
        """Create the organized directory structure."""
        directories = [
            # Root
            self.base_path,
            # HTML/CSS
            self.html_dir,
            # PDFs at each synthesis stage
            self.pdf_initial_dir,
            self.pdf_with_handwriting_dir,
            self.pdf_with_visual_elements_dir,
            self.pdf_final_dir,
            # Rendered images
            self.img_dir,
            # Bounding boxes
            self.bbox_pdf_word_dir,
            self.bbox_pdf_char_dir,
            self.bbox_final_word_dir,
            self.bbox_final_segment_dir,
            self.bbox_final_normalized_word_dir,
            self.bbox_final_normalized_segment_dir,
            # Annotations
            self.raw_annotations_dir,
            self.gt_dir,
            self.gt_verification_dir,
            self.token_mapping_dir,
            # Handwriting
            self.handwriting_regions_dir,
            self.handwriting_tokens_dir,
            # Visual elements
            self.visual_element_definitions_dir,
            self.visual_element_images_dir,
            # Layout
            self.layout_dir,
            # Geometries
            self.geometries_dir,
            # OCR
            self.ocr_results_dir,
            # Analysis
            self.analysis_dir,
            # Debug
            self.debug_dir,
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

    @property
    def html_dir(self) -> pathlib.Path:
        """HTML and CSS files"""
        return self.base_path / "html"

    @property
    def pdf_initial_dir(self) -> pathlib.Path:
        """PDFs before any synthesis"""
        return self.base_path / "pdf" / "pdf_initial"

    @property
    def pdf_with_handwriting_dir(self) -> pathlib.Path:
        """PDFs with only handwriting added"""
        return self.base_path / "pdf" / "pdf_with_handwriting"

    @property
    def pdf_with_visual_elements_dir(self) -> pathlib.Path:
        """PDFs with only visual elements added"""
        return self.base_path / "pdf" / "pdf_with_visual_elements"

    @property
    def pdf_final_dir(self) -> pathlib.Path:
        """PDFs with both handwriting and visual elements"""
        return self.base_path / "pdf" / "pdf_final"

    @property
    def img_dir(self) -> pathlib.Path:
        """Final rendered images"""
        return self.base_path / "img"

    @property
    def bbox_pdf_word_dir(self) -> pathlib.Path:
        """Word-level bounding boxes extracted from PDF (ground truth positions)"""
        return self.base_path / "bbox" / "bbox_pdf" / "word"

    @property
    def bbox_pdf_char_dir(self) -> pathlib.Path:
        """Character-level bounding boxes extracted from PDF"""
        return self.base_path / "bbox" / "bbox_pdf" / "char"

    @property
    def bbox_final_word_dir(self) -> pathlib.Path:
        """Final word-level bounding boxes (from OCR if modifications applied, else from PDF)"""
        return self.base_path / "bbox" / "bbox_final" / "word"

    @property
    def bbox_final_segment_dir(self) -> pathlib.Path:
        """Final segment-level bounding boxes (from OCR if modifications applied, else from PDF)"""
        return self.base_path / "bbox" / "bbox_final" / "segment"

    @property
    def bbox_final_normalized_word_dir(self) -> pathlib.Path:
        """Normalized word-level bounding boxes"""
        return self.base_path / "bbox" / "bbox_final_normalized" / "word"

    @property
    def bbox_final_normalized_segment_dir(self) -> pathlib.Path:
        """Normalized segment-level bounding boxes"""
        return self.base_path / "bbox" / "bbox_final_normalized" / "segment"

    @property
    def raw_annotations_dir(self) -> pathlib.Path:
        """Raw annotations (layout boxes before normalization)"""
        return self.base_path / "annotations" / "raw_annotations"

    @property
    def gt_dir(self) -> pathlib.Path:
        """Ground truth annotations"""
        return self.base_path / "annotations" / "gt"

    @property
    def gt_verification_dir(self) -> pathlib.Path:
        """Ground truth verification results"""
        return self.base_path / "annotations" / "gt_verification"

    @property
    def token_mapping_dir(self) -> pathlib.Path:
        """Token mapping files"""
        return self.base_path / "annotations" / "token_mapping"

    @property
    def handwriting_regions_dir(self) -> pathlib.Path:
        """Handwriting region definitions"""
        return self.base_path / "handwriting" / "handwriting_regions"

    @property
    def handwriting_tokens_dir(self) -> pathlib.Path:
        """Handwriting token images (per-document subfolders)"""
        return self.base_path / "handwriting" / "handwriting_tokens"

    @property
    def visual_element_definitions_dir(self) -> pathlib.Path:
        """Visual element definitions"""
        return self.base_path / "visual_elements" / "visual_element_definitions"

    @property
    def visual_element_images_dir(self) -> pathlib.Path:
        """Visual element images (per-document subfolders)"""
        return self.base_path / "visual_elements" / "visual_element_images"

    @property
    def layout_dir(self) -> pathlib.Path:
        """Layout element definitions"""
        return self.base_path / "layout"

    @property
    def geometries_dir(self) -> pathlib.Path:
        """Extracted geometries from HTML"""
        return self.base_path / "geometries"

    @property
    def ocr_results_dir(self) -> pathlib.Path:
        """OCR results"""
        return self.base_path / "ocr_results"

    @property
    def analysis_dir(self) -> pathlib.Path:
        """Analysis statistics"""
        return self.base_path / "analysis"

    @property
    def debug_dir(self) -> pathlib.Path:
        """Debug visualizations"""
        return self.base_path / "debug"

    @property
    def msgpack_path(self) -> pathlib.Path:
        """
        Path to the dataset msgpack file.

        This file aggregates all documents in the dataset into a single msgpack
        for efficient loading during ML training.
        """
        return self.base_path / "dataset.msgpack"

    @property
    def metadata_path(self) -> pathlib.Path:
        """Path to dataset metadata JSON"""
        return self.base_path / "metadata.json"

    def add_document(
        self,
        document_id: str,
        html: str,
        css: str,
        pdf_initial: Optional[bytes] = None,
        pdf_with_handwriting: Optional[bytes] = None,
        pdf_with_visual_elements: Optional[bytes] = None,
        pdf_final: Optional[bytes] = None,
        final_image: Optional[bytes] = None,
        ground_truth: Optional[dict] = None,
        raw_annotations: Optional[list] = None,
        bboxes_pdf_word: Optional[list] = None,
        bboxes_pdf_char: Optional[list] = None,
        bboxes_final_word: Optional[list] = None,
        bboxes_final_segment: Optional[list] = None,
        bboxes_normalized_word: Optional[list] = None,
        bboxes_normalized_segment: Optional[list] = None,
        gt_verification: Optional[dict] = None,
        token_mapping: Optional[dict] = None,
        handwriting_regions: Optional[list] = None,
        handwriting_images: Optional[dict] = None,
        visual_elements: Optional[list] = None,
        visual_element_images: Optional[dict] = None,
        layout_elements: Optional[list] = None,
        geometries: Optional[list] = None,
        ocr_results: Optional[dict] = None,
        analysis_stats: Optional[dict] = None,
        debug_visualization: Optional[bytes] = None,
    ):
        """
        Add a document to the dataset export.

        Args:
            document_id: Unique document identifier
            html: Document HTML content
            css: Document CSS content
            pdf_initial: Initial PDF bytes (before modifications)
            pdf_with_handwriting: PDF bytes after handwriting insertion
            pdf_with_visual_elements: PDF bytes after visual element insertion (no handwriting)
            pdf_final: PDF bytes with both handwriting and visual elements
            final_image: Final rendered image (PNG bytes)
            ground_truth: Ground truth annotations
            raw_annotations: Raw layout boxes (before normalization)
            bboxes_pdf_word: Word-level bboxes from PDF (ground truth)
            bboxes_pdf_char: Character-level bboxes from PDF
            bboxes_final_word: Final word-level bboxes (OCR or PDF)
            bboxes_final_segment: Final segment-level bboxes (OCR or PDF)
            bboxes_normalized_word: Normalized word-level bboxes
            bboxes_normalized_segment: Normalized segment-level bboxes
            gt_verification: Ground truth verification results
            token_mapping: Token to bbox mapping
            handwriting_regions: Handwriting region metadata
            handwriting_images: Handwriting token images keyed by token id
                (base64 strings or dicts with an 'image_base64' field)
            visual_elements: Visual element metadata
            visual_element_images: Base64-encoded visual element images keyed by element id
            layout_elements: Layout element definitions
            geometries: Extracted geometries from HTML
            ocr_results: OCR results
            analysis_stats: Analysis statistics
            debug_visualization: Debug visualization image (PNG bytes)
        """
        # HTML and CSS
        (self.html_dir / f"{document_id}.html").write_text(html, encoding='utf-8')
        (self.html_dir / f"{document_id}.css").write_text(css, encoding='utf-8')

        # PDFs at each synthesis stage
        if pdf_initial:
            (self.pdf_initial_dir / f"{document_id}.pdf").write_bytes(pdf_initial)

        if pdf_with_handwriting:
            (self.pdf_with_handwriting_dir / f"{document_id}.pdf").write_bytes(pdf_with_handwriting)

        if pdf_with_visual_elements:
            (self.pdf_with_visual_elements_dir / f"{document_id}.pdf").write_bytes(pdf_with_visual_elements)

        if pdf_final:
            (self.pdf_final_dir / f"{document_id}.pdf").write_bytes(pdf_final)

        # Final rendered image
        if final_image:
            (self.img_dir / f"{document_id}.png").write_bytes(final_image)

        # Annotations
        if raw_annotations:
            (self.raw_annotations_dir / f"{document_id}.json").write_text(
                json.dumps(raw_annotations, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if ground_truth:
            (self.gt_dir / f"{document_id}.json").write_text(
                json.dumps(ground_truth, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if gt_verification:
            (self.gt_verification_dir / f"{document_id}.json").write_text(
                json.dumps(gt_verification, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if token_mapping:
            (self.token_mapping_dir / f"{document_id}.json").write_text(
                json.dumps(token_mapping, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        # Bounding boxes
        if bboxes_pdf_word:
            (self.bbox_pdf_word_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_pdf_word, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if bboxes_pdf_char:
            (self.bbox_pdf_char_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_pdf_char, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if bboxes_final_word:
            (self.bbox_final_word_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_final_word, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if bboxes_final_segment:
            (self.bbox_final_segment_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_final_segment, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if bboxes_normalized_word:
            (self.bbox_final_normalized_word_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_normalized_word, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if bboxes_normalized_segment:
            (self.bbox_final_normalized_segment_dir / f"{document_id}.json").write_text(
                json.dumps(bboxes_normalized_segment, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        # Handwriting
        if handwriting_regions:
            (self.handwriting_regions_dir / f"{document_id}.json").write_text(
                json.dumps(handwriting_regions, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if handwriting_images:
            # Token images go in a per-document subfolder
            doc_hw_tokens_dir = self.handwriting_tokens_dir / document_id
            doc_hw_tokens_dir.mkdir(parents=True, exist_ok=True)

            for hw_id, img_data_raw in handwriting_images.items():
                # Entries may be raw base64 strings or dicts carrying 'image_base64'
                if isinstance(img_data_raw, dict):
                    img_b64 = img_data_raw.get('image_base64')
                else:
                    img_b64 = img_data_raw

                if img_b64:
                    img_bytes = base64.b64decode(img_b64)
                    (doc_hw_tokens_dir / f"{hw_id}.png").write_bytes(img_bytes)

        # Visual elements
        if visual_elements:
            (self.visual_element_definitions_dir / f"{document_id}.json").write_text(
                json.dumps(visual_elements, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if visual_element_images:
            # Element images go in a per-document subfolder
            doc_ve_images_dir = self.visual_element_images_dir / document_id
            doc_ve_images_dir.mkdir(parents=True, exist_ok=True)

            for ve_id, img_b64 in visual_element_images.items():
                img_bytes = base64.b64decode(img_b64)
                (doc_ve_images_dir / f"{ve_id}.png").write_bytes(img_bytes)

        # Layout, geometries, OCR, analysis, debug
        if layout_elements:
            (self.layout_dir / f"{document_id}.json").write_text(
                json.dumps(layout_elements, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if geometries:
            (self.geometries_dir / f"{document_id}.json").write_text(
                json.dumps(geometries, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if ocr_results:
            (self.ocr_results_dir / f"{document_id}.json").write_text(
                json.dumps(ocr_results, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if analysis_stats:
            (self.analysis_dir / f"{document_id}.json").write_text(
                json.dumps(analysis_stats, indent=2, ensure_ascii=False), encoding='utf-8'
            )

        if debug_visualization:
            (self.debug_dir / f"{document_id}_debug.png").write_bytes(debug_visualization)

        # Track the document in the dataset manifest
        self.documents.append({
            'document_id': document_id,
            'has_handwriting': handwriting_regions is not None and len(handwriting_regions) > 0,
            'has_visual_elements': visual_elements is not None and len(visual_elements) > 0,
            'has_ocr': ocr_results is not None,
            'modification_type': (
                "both" if pdf_final
                else "handwriting" if pdf_with_handwriting
                else "visual_elements" if pdf_with_visual_elements
                else None
            )
        })
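
    # Note on payload shape: _create_msgpack_dataset expects each entry of
    # bboxes_normalized_word / bboxes_normalized_segment to be a dict with the
    # keys 'x0', 'y0', 'x2', 'y2' (plus 'text' for word entries), e.g.
    # (values illustrative):
    #
    #     {"text": "Invoice", "x0": 0.12, "y0": 0.08, "x2": 0.31, "y2": 0.11}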

    def finalize(
        self,
        request_id: Optional[str] = None,
        user_id: Optional[int] = None,
        prompt_params: Optional[dict] = None,
        api_mode: str = "sync"
    ) -> pathlib.Path:
        """
        Finalize the dataset export by creating metadata, README, and optionally msgpack.

        Args:
            request_id: Request UUID for tracking
            user_id: User ID who made the request
            prompt_params: Prompt parameters used for generation
            api_mode: "sync" or "async"

        Returns:
            Path to the dataset base directory
        """
        global_stats = self._calculate_global_stats()

        metadata = {
            'dataset_name': self.dataset_name,
            'num_documents': len(self.documents),
            'global_analysis': global_stats,
            'documents': self.documents,
            'structure_version': '2.1',
            'structure_description': 'Organized dataset with research-grade global analysis',
            'generation_metadata': {
                'request_id': request_id,
                'user_id': user_id,
                'api_mode': api_mode,
                'prompt_params': prompt_params or {}
            }
        }

        # Write metadata (plus a duplicate log copy)
        metadata_json = json.dumps(metadata, indent=2, ensure_ascii=False)
        self.metadata_path.write_text(metadata_json, encoding='utf-8')
        (self.base_path / "dataset_log.json").write_text(metadata_json, encoding='utf-8')

        # README
        readme_content = self._generate_readme()
        (self.base_path / "README.md").write_text(readme_content, encoding='utf-8')

        # Cost report
        self._save_cost_report()

        # Optional msgpack export (requires normalized bboxes)
        enable_dataset_export = prompt_params.get('enable_dataset_export', False) if prompt_params else False
        dataset_export_format = prompt_params.get('dataset_export_format', 'msgpack') if prompt_params else 'msgpack'

        if enable_dataset_export and dataset_export_format.lower() == 'msgpack':
            enable_bbox_normalization = prompt_params.get('enable_bbox_normalization', False) if prompt_params else False

            if enable_bbox_normalization:
                self._create_msgpack_dataset()
            else:
                print("  ⚠ Msgpack export requested but bbox normalization is disabled")
                print("    Msgpack requires normalized bboxes. Set 'enable_bbox_normalization: true' to export msgpack.")

        return self.base_path

    def _create_msgpack_dataset(self):
        """
        Create a single msgpack file aggregating all documents.

        This follows the original pipeline's approach of creating one msgpack
        with all documents for easy loading in ML training pipelines.
        """
        try:
            from datadings.writer import FileWriter

            print("  📦 Creating msgpack dataset...")

            samples = []
            for doc in self.documents:
                doc_id = doc['document_id']

                bbox_word_path = self.bbox_final_normalized_word_dir / f"{doc_id}.json"
                bbox_segment_path = self.bbox_final_normalized_segment_dir / f"{doc_id}.json"

                # Normalized word bboxes are mandatory for a msgpack sample
                if not bbox_word_path.exists():
                    print(f"  ⚠ Skipping {doc_id}: no normalized bboxes found")
                    continue

                word_bboxes_data = json.loads(bbox_word_path.read_text(encoding='utf-8'))

                # Fall back to word-level data if no segment-level bboxes exist
                if bbox_segment_path.exists():
                    segment_bboxes_data = json.loads(bbox_segment_path.read_text(encoding='utf-8'))
                else:
                    segment_bboxes_data = word_bboxes_data

                words = [item.get('text', '') for item in word_bboxes_data]

                word_bboxes = [
                    [item['x0'], item['y0'], item['x2'], item['y2']]
                    for item in word_bboxes_data
                ]

                segment_bboxes = [
                    [item['x0'], item['y0'], item['x2'], item['y2']]
                    for item in segment_bboxes_data
                ]

                # Ground truth annotations, if present
                gt_path = self.gt_dir / f"{doc_id}.json"
                annotations = {}
                if gt_path.exists():
                    annotations = json.loads(gt_path.read_text(encoding='utf-8'))

                # Prefer the rendered image; fall back to final, then initial PDF
                img_path = self.img_dir / f"{doc_id}.png"
                if not img_path.exists():
                    img_path = self.pdf_final_dir / f"{doc_id}.pdf"
                    if not img_path.exists():
                        img_path = self.pdf_initial_dir / f"{doc_id}.pdf"

                sample = {
                    'key': doc_id,
                    'sample_id': doc_id,
                    'image_file_path': str(img_path),
                    'words': words,
                    'word_bboxes': word_bboxes,
                    'segment_level_bboxes': segment_bboxes,
                }

                if annotations:
                    sample.update(annotations)

                # Verification results, if present
                v_path = self.gt_verification_dir / f"{doc_id}.json"
                if v_path.exists():
                    v_data = json.loads(v_path.read_text(encoding='utf-8'))
                    sample['gt_verification'] = v_data
                    sample['confirmed_keys'] = v_data.get('confirmed_keys', [])
                    sample['bbox_indices_per_key'] = v_data.get('bbox_indices_per_key', {})

                # Analysis statistics, if present
                a_path = self.analysis_dir / f"{doc_id}.json"
                if a_path.exists():
                    a_data = json.loads(a_path.read_text(encoding='utf-8'))
                    sample['analysis_stats'] = a_data

                samples.append(sample)

            if not samples:
                print("  ⚠ No samples to write to msgpack - skipping")
                return

            with FileWriter(self.msgpack_path, overwrite=True) as writer:
                for sample in samples:
                    writer.write(sample)

            print(f"  ✓ Created msgpack dataset: {self.msgpack_path.name} ({len(samples)} documents)")

        except ImportError:
            print("  ⚠ datadings not installed - skipping msgpack creation")
            print("    Install with: pip install datadings")
        except Exception as e:
            print(f"  ⚠ Failed to create msgpack: {e}")
            import traceback
            traceback.print_exc()

    def _calculate_global_stats(self) -> Dict[str, Any]:
        """Aggregate stats from all documents in the dataset."""
        try:
            total_docs = len(self.documents)
            if total_docs == 0:
                return {}

            error_counter = Counter()
            has_handwriting = 0
            has_visual_elements = 0
            has_ocr = 0
            valid_docs = 0
            total_annotations = 0
            total_gt_bboxes = 0

            for doc in self.documents:
                doc_id = doc['document_id']
                a_path = self.analysis_dir / f"{doc_id}.json"

                if a_path.exists():
                    try:
                        data = json.loads(a_path.read_text(encoding='utf-8'))

                        # Tally per-document errors
                        for err in data.get('errors', []):
                            error_counter[err] += 1

                        # Feature flags
                        if data.get('has_handwriting'):
                            has_handwriting += 1
                        if data.get('has_visual_elements'):
                            has_visual_elements += 1
                        if data.get('has_ocr'):
                            has_ocr += 1
                        if data.get('is_valid'):
                            valid_docs += 1

                        # Totals for averages
                        total_annotations += data.get('annotations_count', 0)
                        total_gt_bboxes += data.get('num_gt_bboxes', 0)
                    except Exception:
                        # Skip unreadable or malformed per-document stats
                        pass

            return {
                "total_documents": total_docs,
                "valid_documents": valid_docs,
                "invalid_documents": total_docs - valid_docs,
                "error_counts": dict(error_counter),
                "features": {
                    "has_handwriting": has_handwriting,
                    "has_visual_elements": has_visual_elements,
                    "has_ocr": has_ocr
                },
                "averages": {
                    "annotations_per_doc": total_annotations / total_docs if total_docs > 0 else 0,
                    "gt_bboxes_per_doc": total_gt_bboxes / total_docs if total_docs > 0 else 0
                }
            }
        except Exception as e:
            print(f"  ⚠ Failed to calculate global stats: {e}")
            return {}

    def _generate_readme(self) -> str:
        """Generate README content for the dataset."""
        return f"""# DocGenie Dataset: {self.dataset_name}

Generated using DocGenie API - Synthetic Document Generation Pipeline

## Dataset Structure

This dataset follows the original pipeline's organized structure with categorized folders:

```
{self.dataset_name}/
├── dataset.msgpack                  # Aggregated dataset (all documents)
├── metadata.json                    # Dataset metadata
├── README.md                        # This file
│
├── html/                            # HTML and CSS files
│   ├── document_1.html
│   ├── document_1.css
│   └── ...
│
├── pdf/                             # PDF files at different stages
│   ├── pdf_initial/                 # Before synthesis
│   ├── pdf_with_handwriting/        # With handwriting only
│   ├── pdf_with_visual_elements/    # With visual elements only
│   └── pdf_final/                   # With both features
│
├── img/                             # Final rendered images
│   ├── document_1.png
│   └── ...
│
├── bbox/                            # Bounding boxes
│   ├── bbox_pdf/                    # Extracted from PDF (ground truth positions)
│   │   ├── word/                    # Word-level from PDF
│   │   └── char/                    # Character-level from PDF
│   ├── bbox_final/                  # Final bboxes (OCR if modified, else PDF)
│   │   ├── word/                    # Word-level (unnormalized)
│   │   └── segment/                 # Segment-level (unnormalized)
│   └── bbox_final_normalized/       # Normalized (0-1 range)
│       ├── word/                    # Word-level normalized
│       └── segment/                 # Segment-level normalized
│
├── annotations/                     # Ground truth and mappings
│   ├── raw_annotations/             # Raw layout boxes (before normalization)
│   ├── gt/                          # Ground truth annotations
│   ├── gt_verification/             # Verification results
│   └── token_mapping/               # Token-to-bbox mappings
│
├── handwriting/                     # Handwriting data
│   ├── handwriting_regions/         # Region definitions
│   └── handwriting_tokens/          # Token images (subfolders per document)
│       ├── document_1/
│       │   ├── hw1_b3_l1_w0.png
│       │   └── ...
│       └── ...
│
├── visual_elements/                 # Visual element data
│   ├── visual_element_definitions/  # Element definitions
│   └── visual_element_images/       # Element images (subfolders per document)
│       ├── document_1/
│       │   ├── ve0.png
│       │   └── ...
│       └── ...
│
├── layout/                          # Layout element definitions
├── geometries/                      # Extracted geometries
├── ocr_results/                     # OCR results
├── analysis/                        # Analysis statistics
└── debug/                           # Debug visualizations
```

## Dataset Statistics

- **Total Documents**: {len(self.documents)}
- **Documents with Handwriting**: {sum(1 for d in self.documents if d['has_handwriting'])}
- **Documents with Visual Elements**: {sum(1 for d in self.documents if d['has_visual_elements'])}
- **Documents with OCR**: {sum(1 for d in self.documents if d['has_ocr'])}

## Usage

This dataset is designed for document understanding and OCR tasks. Files are organized by category for easy access and processing.

### Loading the Entire Dataset (Msgpack)

The easiest way to load all documents for ML training:

```python
from datadings.reader import MsgpackReader

# Load the aggregated dataset
reader = MsgpackReader('dataset.msgpack')

# Iterate through all documents
for sample in reader:
    doc_id = sample['sample_id']
    words = sample['words']
    word_bboxes = sample['word_bboxes']  # Normalized [x0, y0, x2, y2]
    image_path = sample['image_file_path']
    # Ground truth annotations are included in the sample
```

For more information on the msgpack format, see: https://github.com/mweiss/datadings

### Loading Individual Documents

Each document is identified by its `document_id` (e.g., "document_1"). To load a document:

1. **HTML/CSS**: `html/document_1.html`, `html/document_1.css`
2. **PDF stages**: Check `pdf/pdf_initial/`, `pdf/pdf_final/`, etc.
3. **Images**: `img/document_1.png`
4. **Annotations**: `annotations/gt/document_1.json`, `annotations/raw_annotations/document_1.json`
5. **Bounding boxes**:
   - PDF-extracted (ground truth): `bbox/bbox_pdf/word/document_1.json`, `bbox/bbox_pdf/char/document_1.json`
   - Final bboxes: `bbox/bbox_final/word/document_1.json` (OCR or PDF)
   - Normalized: `bbox/bbox_final_normalized/word/document_1.json`
6. **Tokens**: `handwriting/handwriting_tokens/document_1/`, `visual_elements/visual_element_images/document_1/`
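
For example, a minimal sketch for reading one document's files (paths are relative to the dataset root; the document id is illustrative, and each file exists only if its payload was exported):

```python
import json
from pathlib import Path

root = Path(".")  # dataset root
doc_id = "document_1"

# Final word-level bounding boxes (OCR-based if the document was modified)
word_bboxes = json.loads(
    (root / "bbox" / "bbox_final" / "word" / (doc_id + ".json")).read_text(encoding="utf-8")
)

# Ground truth annotations for the same document
ground_truth = json.loads(
    (root / "annotations" / "gt" / (doc_id + ".json")).read_text(encoding="utf-8")
)
```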

### Notes

- Bounding boxes in `bbox_pdf` are extracted from the PDF and represent ground truth text positions
- Bounding boxes in `bbox_final` come from OCR (if the document has handwriting or visual elements) or from the PDF (otherwise)
- Bounding boxes in `bbox_final_normalized` are normalized to the [0, 1] range for ML training (see the sketch below)
- Character-level bboxes (`bbox_pdf/char/`) provide fine-grained text localization
- Raw annotations show the original layout boxes before normalization
- Token images are organized in per-document subfolders
- OCR results and analysis are only present if those features were enabled
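
To map a normalized box back to pixel coordinates, one straightforward sketch (assuming the boxes were normalized against the rendered image's dimensions; requires Pillow) is:

```python
from PIL import Image

img = Image.open("img/document_1.png")
w, h = img.size

box = [0.12, 0.08, 0.31, 0.11]  # [x0, y0, x2, y2] in [0, 1], illustrative values
pixel_box = [box[0] * w, box[1] * h, box[2] * w, box[3] * h]
```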

---
Generated by DocGenie API v2.0
"""

    def _save_cost_report(self):
        """Save a detailed cost report in research-grade format."""
        report_path = self.base_path / "cost_report.json"

        # Batch processing runs at a 50% discount, so report the full price
        # alongside the discounted total actually charged.
        total_full_cost = self.cost_summary["total_cost_usd"]
        discounted_cost = total_full_cost / 2.0

        # Average cost per exported document
        valid_docs = len(self.documents)
        if valid_docs > 0:
            avg_cost = discounted_cost / valid_docs
        else:
            avg_cost = 0.0

        final_report = {
            **self.cost_summary,
            "total_full_price_usd": total_full_cost,
            "total_cost_usd": discounted_cost,
            "batch_discount_applied": "50%",
            "avg_cost_per_document": avg_cost,
            "num_documents": valid_docs,
            "currency": "USD"
        }

        with open(report_path, 'w') as f:
            json.dump(final_report, f, indent=2)

        print(f"  ✓ Cost report saved (with 50% batch discount): {report_path}")
|