Ahadhassan-2003 commited on
Commit ·
9a40780
0
Parent(s):
deploy: update HF Space
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +59 -0
- .gitattributes +7 -0
- .gitignore +172 -0
- .gitlab-ci.yml +16 -0
- .python-version +1 -0
- API_FLOW_DOCUMENTATION.md +1024 -0
- ARCHITECTURE.md +278 -0
- DEPLOYMENT.md +875 -0
- Dockerfile +96 -0
- GENERATION_PIPELINE_DOCUMENTATION.md +0 -0
- LLM_PROJECT_CONTEXT_NOTE.md +254 -0
- README.md +454 -0
- TESTING_PLAN.md +1161 -0
- api/README.md +1220 -0
- api/TESTING.md +936 -0
- api/__init__.py +4 -0
- api/config.py +128 -0
- api/dataset_exporter.py +733 -0
- api/example_usage.py +143 -0
- api/google_drive.py +271 -0
- api/main.py +1756 -0
- api/quick_test.sh +93 -0
- api/requirements.txt +79 -0
- api/schemas.py +339 -0
- api/start.sh +42 -0
- api/start_worker.sh +95 -0
- api/supabase_client.py +283 -0
- api/test_api.py +261 -0
- api/test_async_api.py +321 -0
- api/test_get_google_token.py +274 -0
- api/test_runpod_integration.py +123 -0
- api/test_sync_pdf_api.py +312 -0
- api/utils.py +0 -0
- api/worker.py +804 -0
- data/docvqa_hw/handschrift_mit_qid.jsonl +83 -0
- data/docvqa_hw/zahlen_mit_qid.jsonl +20 -0
- data/exports/DocVQA_clip_kmeans.png +3 -0
- data/exports/DocVQA_layout_kmeans.png +3 -0
- data/exports/DocVQA_text_kmeans.png +3 -0
- data/models/handwriting/cached_vae/config.json +38 -0
- data/models/handwriting/char_vocab.json +89 -0
- data/models/handwriting/config.yaml +95 -0
- data/models/handwriting/writer_id_map.json +659 -0
- data/prompt_templates/Adaptation_GT/seed-based.txt +40 -0
- data/prompt_templates/Adaptation_GT/seed-free.txt +25 -0
- data/prompt_templates/ClaudeRefined1/seed-based.txt +78 -0
- data/prompt_templates/ClaudeRefined10/seed-based.txt +57 -0
- data/prompt_templates/ClaudeRefined11/seed-based.txt +55 -0
- data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt +55 -0
- data/prompt_templates/ClaudeRefined12/seed-based-json.txt +55 -0
.dockerignore
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore development artifacts
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
.Python
|
| 7 |
+
*.so
|
| 8 |
+
*.dylib
|
| 9 |
+
*.log
|
| 10 |
+
.venv/
|
| 11 |
+
venv/
|
| 12 |
+
ENV/
|
| 13 |
+
env/
|
| 14 |
+
.git/
|
| 15 |
+
.gitignore
|
| 16 |
+
.gitlab-ci.yml
|
| 17 |
+
*.md
|
| 18 |
+
!README.md
|
| 19 |
+
.pytest_cache/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
*~
|
| 23 |
+
.DS_Store
|
| 24 |
+
|
| 25 |
+
# Ignore data directories (too large for Docker context)
|
| 26 |
+
data/
|
| 27 |
+
!data/prompt_templates/
|
| 28 |
+
!data/visual_element_prefabs/
|
| 29 |
+
|
| 30 |
+
# Ignore build artifacts
|
| 31 |
+
*.egg-info/
|
| 32 |
+
dist/
|
| 33 |
+
build/
|
| 34 |
+
*.whl
|
| 35 |
+
|
| 36 |
+
# Ignore handwriting service (separate deployment)
|
| 37 |
+
handwriting_service/
|
| 38 |
+
|
| 39 |
+
# Ignore WordStylist (not needed for API)
|
| 40 |
+
WordStylist/
|
| 41 |
+
|
| 42 |
+
# Ignore scripts (not needed for API runtime)
|
| 43 |
+
scripts/
|
| 44 |
+
|
| 45 |
+
# Ignore documentation and deployment files
|
| 46 |
+
ARCHITECTURE.md
|
| 47 |
+
DEPLOYMENT.md
|
| 48 |
+
*.sh
|
| 49 |
+
!start.sh
|
| 50 |
+
!start_worker.sh
|
| 51 |
+
docker-compose.yml
|
| 52 |
+
railway.json
|
| 53 |
+
railway_setup_vars.sh
|
| 54 |
+
|
| 55 |
+
# Keep only essential code
|
| 56 |
+
!docgenie/
|
| 57 |
+
!api/
|
| 58 |
+
!setup.py
|
| 59 |
+
!pyproject.toml
|
.gitattributes
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.ico filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.svg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project
|
| 2 |
+
data/clusters/
|
| 3 |
+
data/embeddings/
|
| 4 |
+
data/temp/
|
| 5 |
+
wandb/
|
| 6 |
+
data/models/
|
| 7 |
+
data/webapp_cache/
|
| 8 |
+
data/analyzation/
|
| 9 |
+
data/cherrypicks/
|
| 10 |
+
data/hw_imgs/
|
| 11 |
+
/data/seed-images/*
|
| 12 |
+
/docgenie/playground/test.py
|
| 13 |
+
/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images
|
| 14 |
+
/docgenie/playground/handwritten_text/handwriting_raw_tokens
|
| 15 |
+
/docgenie/playground/handwritten_text/temp
|
| 16 |
+
data/datasets
|
| 17 |
+
data/models
|
| 18 |
+
data/cluster_plots
|
| 19 |
+
data/syn_dataset_statistics_plots
|
| 20 |
+
data/gt_embeddings
|
| 21 |
+
data/wandb_downloads
|
| 22 |
+
data/wandb_project_csvs
|
| 23 |
+
data/folders.txt
|
| 24 |
+
cache
|
| 25 |
+
runs
|
| 26 |
+
visualizations
|
| 27 |
+
.venv
|
| 28 |
+
**/**.__pycache__
|
| 29 |
+
/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images
|
| 30 |
+
/docgenie/playground/handwritten_text/temp
|
| 31 |
+
data/datasets
|
| 32 |
+
data/models
|
| 33 |
+
|
| 34 |
+
# Python
|
| 35 |
+
__pycache__/
|
| 36 |
+
*.py[cod]
|
| 37 |
+
*$py.class
|
| 38 |
+
*.so
|
| 39 |
+
.Python
|
| 40 |
+
build/
|
| 41 |
+
develop-eggs/
|
| 42 |
+
dist/
|
| 43 |
+
downloads/
|
| 44 |
+
eggs/
|
| 45 |
+
.eggs/
|
| 46 |
+
lib/
|
| 47 |
+
lib64/
|
| 48 |
+
parts/
|
| 49 |
+
sdist/
|
| 50 |
+
var/
|
| 51 |
+
wheels/
|
| 52 |
+
*.egg-info/
|
| 53 |
+
.installed.cfg
|
| 54 |
+
*.egg
|
| 55 |
+
MANIFEST
|
| 56 |
+
*.log
|
| 57 |
+
|
| 58 |
+
# Virtual environments
|
| 59 |
+
venv/
|
| 60 |
+
env/
|
| 61 |
+
ENV/
|
| 62 |
+
.venv
|
| 63 |
+
|
| 64 |
+
# IDE
|
| 65 |
+
.vscode/
|
| 66 |
+
.idea/
|
| 67 |
+
*.swp
|
| 68 |
+
*.swo
|
| 69 |
+
*~
|
| 70 |
+
.DS_Store
|
| 71 |
+
|
| 72 |
+
# Jupyter Notebook
|
| 73 |
+
.ipynb_checkpoints
|
| 74 |
+
*.ipynb_checkpoints/
|
| 75 |
+
|
| 76 |
+
# Model artifacts - download separately
|
| 77 |
+
inference/
|
| 78 |
+
inference_new/
|
| 79 |
+
inference_hf/
|
| 80 |
+
model/experiments/hf_conditional_latent/cached_vae/
|
| 81 |
+
*.zip
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Datasets - download separately
|
| 85 |
+
docvqa-handwritten-sizes4/
|
| 86 |
+
syn_docvqa/
|
| 87 |
+
iam_dataset/
|
| 88 |
+
iam_dataset_processed/
|
| 89 |
+
iam_dataset_processed_partial/
|
| 90 |
+
docvqa-test/
|
| 91 |
+
docvqa-viselems/
|
| 92 |
+
docvqa-viselems2/
|
| 93 |
+
temp/
|
| 94 |
+
generations/
|
| 95 |
+
|
| 96 |
+
# Generated outputs
|
| 97 |
+
output/
|
| 98 |
+
|
| 99 |
+
# Backup files
|
| 100 |
+
*.bak
|
| 101 |
+
*.backup
|
| 102 |
+
*.tmp
|
| 103 |
+
|
| 104 |
+
# Testing
|
| 105 |
+
.pytest_cache/
|
| 106 |
+
.coverage
|
| 107 |
+
htmlcov/
|
| 108 |
+
|
| 109 |
+
# OS
|
| 110 |
+
./data/clusters_old/
|
| 111 |
+
Thumbs.db
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# Training
|
| 115 |
+
training/
|
| 116 |
+
vae_evaluation/
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# Logs and checkpoints
|
| 120 |
+
*.pt
|
| 121 |
+
# But allow the inference model for handwriting service
|
| 122 |
+
!handwriting_service/WordStylist/models/ema_ckpt.pt
|
| 123 |
+
*.ckpt
|
| 124 |
+
*.pth
|
| 125 |
+
*.safetensors
|
| 126 |
+
|
| 127 |
+
.env
|
| 128 |
+
|
| 129 |
+
# Playwright
|
| 130 |
+
node_modules/
|
| 131 |
+
/test-results/
|
| 132 |
+
/playwright-report/
|
| 133 |
+
/blob-report/
|
| 134 |
+
/playwright/.cache/
|
| 135 |
+
/playwright/.auth/
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
!data/models/
|
| 139 |
+
!data/models/handwriting/
|
| 140 |
+
!data/models/handwriting/char_vocab.json
|
| 141 |
+
!data/models/handwriting/config.yaml
|
| 142 |
+
!data/models/handwriting/writer_id_map.json
|
| 143 |
+
!data/models/handwriting/cached_vae/config.json
|
| 144 |
+
data/models/.locks*
|
| 145 |
+
data/models/baseline
|
| 146 |
+
data/models/legacy
|
| 147 |
+
data/models/models*
|
| 148 |
+
data/models/pretrained
|
| 149 |
+
test_run.py
|
| 150 |
+
test_vlm.ipynb
|
| 151 |
+
test.ipynb
|
| 152 |
+
test2.ipynb
|
| 153 |
+
test3.py
|
| 154 |
+
test4.py
|
| 155 |
+
test5.py
|
| 156 |
+
test6.py
|
| 157 |
+
data/results
|
| 158 |
+
data/results_old/
|
| 159 |
+
data/tmp/
|
| 160 |
+
docgenie/playground/extract_02_eval_metrics_from_wandb.py
|
| 161 |
+
docgenie/playground/extract_metrics_from_wandb.py
|
| 162 |
+
data/cached_subsets
|
| 163 |
+
data/mixed_datasets
|
| 164 |
+
data/results_backup_v1
|
| 165 |
+
data/results_v1
|
| 166 |
+
data/old-results/
|
| 167 |
+
data/embeddings
|
| 168 |
+
data/mixed_datasets
|
| 169 |
+
data/results_backup_v1
|
| 170 |
+
sync_datasets.sh
|
| 171 |
+
data/results_latest
|
| 172 |
+
data/results_latest copy
|
.gitlab-ci.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# You can override the included template(s) by including variable overrides
|
| 2 |
+
# SAST customization: https://docs.gitlab.com/ee/user/application_security/sast/#customizing-the-sast-settings
|
| 3 |
+
# Secret Detection customization: https://docs.gitlab.com/user/application_security/secret_detection/pipeline/configure
|
| 4 |
+
# Dependency Scanning customization: https://docs.gitlab.com/ee/user/application_security/dependency_scanning/#customizing-the-dependency-scanning-settings
|
| 5 |
+
# Container Scanning customization: https://docs.gitlab.com/ee/user/application_security/container_scanning/#customizing-the-container-scanning-settings
|
| 6 |
+
# Note that environment variables can be set in several places
|
| 7 |
+
# See https://docs.gitlab.com/ee/ci/variables/#cicd-variable-precedence
|
| 8 |
+
stages:
|
| 9 |
+
- test
|
| 10 |
+
- secret-detection
|
| 11 |
+
variables:
|
| 12 |
+
SECRET_DETECTION_ENABLED: 'true'
|
| 13 |
+
secret_detection:
|
| 14 |
+
stage: secret-detection
|
| 15 |
+
include:
|
| 16 |
+
- template: Security/Secret-Detection.gitlab-ci.yml
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.11.12
|
API_FLOW_DOCUMENTATION.md
ADDED
|
@@ -0,0 +1,1024 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Complete API Flow Documentation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
The DocGenie API provides three endpoints for synthetic document generation, implementing a 19-stage pipeline that transforms seed images and prompts into complete datasets with OCR, ground truth, and optional handwriting/visual elements.
|
| 5 |
+
|
| 6 |
+
**Base URL**: `http://localhost:8000` (development) or Railway deployment
|
| 7 |
+
**Documentation**: `/docs` (FastAPI auto-generated Swagger UI)
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## API Endpoints
|
| 12 |
+
|
| 13 |
+
### 1. `/generate` - Legacy JSON Response (POST)
|
| 14 |
+
**Purpose**: Generate documents and return complete JSON metadata
|
| 15 |
+
**Response**: JSON with HTML, PDF (base64), bounding boxes, optional handwriting/visual elements
|
| 16 |
+
**Use Case**: Testing, development, full metadata inspection
|
| 17 |
+
**Pipeline Stages**: 1-19 (configurable via parameters)
|
| 18 |
+
|
| 19 |
+
### 2. `/generate/pdf` - Sync PDF+Dataset ZIP (POST)
|
| 20 |
+
**Purpose**: Generate documents and return ZIP file with all artifacts
|
| 21 |
+
**Response**: ZIP file containing:
|
| 22 |
+
- `*.pdf` - Generated document PDFs
|
| 23 |
+
- `*_final.pdf` - PDFs with handwriting/visual elements (if enabled)
|
| 24 |
+
- `*.msgpack` - Dataset format (if export enabled)
|
| 25 |
+
- `metadata.json` - Complete generation metadata
|
| 26 |
+
- `handwriting/` - Individual handwriting images
|
| 27 |
+
- `visual_elements/` - Individual visual element images
|
| 28 |
+
|
| 29 |
+
**Use Case**: Production dataset generation, batch processing
|
| 30 |
+
**Pipeline Stages**: 1-19 (all features available)
|
| 31 |
+
|
| 32 |
+
### 3. `/generate/async` - Async Batch Processing (POST)
|
| 33 |
+
**Purpose**: Queue large batch jobs via background worker (Redis Queue)
|
| 34 |
+
**Response**: Task ID for status polling
|
| 35 |
+
**Status Check**: `GET /generate/async/status/{task_id}`
|
| 36 |
+
**Result Download**: `GET /generate/async/result/{task_id}` (returns ZIP)
|
| 37 |
+
**Use Case**: Large-scale dataset generation (100+ documents)
|
| 38 |
+
**Pipeline Stages**: 1-19 (via worker.py)
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
## Request Parameters
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
class GenerateDocumentRequest:
|
| 46 |
+
seed_images: List[HttpUrl] # 1-8 seed images from web URLs
|
| 47 |
+
prompt_params: PromptParameters # Generation configuration
|
| 48 |
+
|
| 49 |
+
class PromptParameters:
|
| 50 |
+
# Core Parameters
|
| 51 |
+
language: str = "english" # Document language
|
| 52 |
+
doc_type: str = "invoice" # Document type (invoice, receipt, form, etc.)
|
| 53 |
+
gt_type: str = "qa" # Ground truth format (qa, kie)
|
| 54 |
+
gt_format: str = "json" # GT encoding (json, annotation)
|
| 55 |
+
num_solutions: int = 1 # Documents per seed set
|
| 56 |
+
|
| 57 |
+
# Feature Toggles (Stages 07-19)
|
| 58 |
+
enable_handwriting: bool = False # Stage 07-09, 12
|
| 59 |
+
handwriting_ratio: float = 0.2 # Probabilistic filter (0.0-1.0)
|
| 60 |
+
enable_visual_elements: bool = False # Stage 08, 10, 13
|
| 61 |
+
visual_element_types: List[str] = [] # Filter types: logo, photo, figure, barcode, etc.
|
| 62 |
+
enable_ocr: bool = True # Stage 15
|
| 63 |
+
enable_bbox_normalization: bool = True # Stage 16
|
| 64 |
+
enable_gt_verification: bool = False # Stage 17
|
| 65 |
+
enable_analysis: bool = False # Stage 18
|
| 66 |
+
enable_debug_visualization: bool = False # Stage 19
|
| 67 |
+
enable_dataset_export: bool = False # Stage 19 (msgpack format)
|
| 68 |
+
dataset_export_format: str = "msgpack" # Currently only msgpack supported
|
| 69 |
+
|
| 70 |
+
# Reproducibility
|
| 71 |
+
seed: Optional[int] = None # Random seed (null = random, int = reproducible)
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Pipeline Architecture: The 19 Stages
|
| 77 |
+
|
| 78 |
+
The API implements all 19 stages of the original batch pipeline in `docgenie/generation/`. Each stage is mapped to corresponding functions in `api/utils.py`.
|
| 79 |
+
|
| 80 |
+
### **Phase 1: Core Pipeline (Stages 01-06)**
|
| 81 |
+
Generate base documents from seed images and LLM prompts.
|
| 82 |
+
|
| 83 |
+
#### **Stage 01: Seed Selection & Download**
|
| 84 |
+
- **Original**: `pipeline_01_select_seeds.py`
|
| 85 |
+
- **API**: `download_seed_images()` in `api/utils.py:117-161`
|
| 86 |
+
- **Process**:
|
| 87 |
+
1. Accept user-provided seed image URLs (1-8 images)
|
| 88 |
+
2. Download with retry logic (3 attempts, exponential backoff)
|
| 89 |
+
3. Handle transient HTTP errors (502, 503, 504, 429)
|
| 90 |
+
4. Convert to base64 for LLM input
|
| 91 |
+
- **Error Handling**: Retry with 2s, 4s, 8s delays; raise HTTPException on failure
|
| 92 |
+
|
| 93 |
+
#### **Stage 02: Prompt LLM**
|
| 94 |
+
- **Original**: `pipeline_02_prompt_llm.py`
|
| 95 |
+
- **API**: `call_claude_api_direct()` in `api/utils.py:550-600`
|
| 96 |
+
- **Process**:
|
| 97 |
+
1. Load prompt template: `data/prompt_templates/ClaudeRefined12/seed-based-json.txt`
|
| 98 |
+
2. Build prompt with parameters: language, doc_type, gt_type, num_solutions
|
| 99 |
+
3. Call Claude API (Anthropic Messages API v1)
|
| 100 |
+
- Model: `claude-3-5-sonnet-20241022` (configurable)
|
| 101 |
+
- Max tokens: 16,000
|
| 102 |
+
- Temperature: 1.0
|
| 103 |
+
- Vision: Send base64-encoded seed images
|
| 104 |
+
4. Receive HTML documents with embedded ground truth
|
| 105 |
+
- **LLM Output Format**: Multiple `<!DOCTYPE html>...</html>` blocks with:
|
| 106 |
+
- CSS styling with page dimensions
|
| 107 |
+
- HTML elements with semantic classes
|
| 108 |
+
- Handwriting markers: `class="handwritten author1"` (author1, author2, etc.)
|
| 109 |
+
- Visual element placeholders: `data-placeholder="logo"`, `data-content="company-logo"`
|
| 110 |
+
- Ground truth: `<script id="GT">{...json...}</script>`
|
| 111 |
+
|
| 112 |
+
#### **Stage 03: Process Response & Extract HTML**
|
| 113 |
+
- **Original**: `pipeline_03_process_response.py`
|
| 114 |
+
- **API**: `extract_html_documents_from_response()` in `api/utils.py:605-635`
|
| 115 |
+
- **Process**:
|
| 116 |
+
1. Parse LLM response for `<!DOCTYPE html>...</html>` blocks (regex)
|
| 117 |
+
2. Prettify HTML with BeautifulSoup
|
| 118 |
+
3. Validate HTML structure
|
| 119 |
+
4. Extract ground truth JSON from `<script id="GT">` tag
|
| 120 |
+
5. Remove GT script tag, clean HTML for rendering
|
| 121 |
+
- **Validation**: Check for required elements, CSS, proper structure
|
| 122 |
+
|
| 123 |
+
#### **Stage 04: Render PDF & Extract Geometries**
|
| 124 |
+
- **Original**: `pipeline_04_render_pdf_and_extract_geos.py`
|
| 125 |
+
- **API**: `render_html_to_pdf()` in `api/utils.py:650-740`
|
| 126 |
+
- **Process**:
|
| 127 |
+
1. Launch Playwright browser (Chromium)
|
| 128 |
+
2. Set page dimensions from CSS `@page` rule
|
| 129 |
+
3. Render HTML to PDF via `page.pdf()`
|
| 130 |
+
4. Extract element geometries:
|
| 131 |
+
- Handwriting elements: `.handwritten` class → `{rect, text, classes, selectorTypes: ["handwriting"]}`
|
| 132 |
+
- Visual elements: `[data-placeholder]` attribute → `{rect, dataPlaceholder, dataContent, selectorTypes: ["visual_element"]}`
|
| 133 |
+
5. Save PDF and geometries JSON
|
| 134 |
+
- **Output**:
|
| 135 |
+
- PDF at 72 DPI (PyMuPDF standard)
|
| 136 |
+
- Geometries at 96 DPI (browser rendering)
|
| 137 |
+
- Dimensions in mm
|
| 138 |
+
|
| 139 |
+
#### **Stage 05: Extract Bounding Boxes**
|
| 140 |
+
- **Original**: `pipeline_05_extract_bboxes_from_pdf.py`
|
| 141 |
+
- **API**: `extract_bboxes_from_rendered_pdf()` in `api/utils.py:750-825`
|
| 142 |
+
- **Process**:
|
| 143 |
+
1. Open PDF with PyMuPDF (fitz)
|
| 144 |
+
2. Extract text at word level: `page.get_text("words")`
|
| 145 |
+
3. Structure bboxes as:
|
| 146 |
+
```python
|
| 147 |
+
{
|
| 148 |
+
"text": "word",
|
| 149 |
+
"x0": float, # left
|
| 150 |
+
"y0": float, # top
|
| 151 |
+
"x1": float, # right (x2)
|
| 152 |
+
"y1": float, # bottom (y2)
|
| 153 |
+
"block_no": int,
|
| 154 |
+
"line_no": int,
|
| 155 |
+
"word_no": int
|
| 156 |
+
}
|
| 157 |
+
```
|
| 158 |
+
4. Filter whitespace-only text
|
| 159 |
+
5. Convert to OCRBox objects for processing
|
| 160 |
+
- **Coordinate System**: PDF points (72 DPI), origin top-left
|
| 161 |
+
|
| 162 |
+
#### **Stage 06: Validation**
|
| 163 |
+
- **Original**: `pipeline_06_validation.py` (implicit)
|
| 164 |
+
- **API**: `validate_html_structure()`, `validate_pdf()`, `validate_bboxes()` in `api/utils.py:830-890`
|
| 165 |
+
- **Checks**:
|
| 166 |
+
- HTML: Required DOCTYPE, head, body, CSS
|
| 167 |
+
- PDF: File readable, page count = 1, has text
|
| 168 |
+
- Bboxes: Minimum count (configurable), valid coordinates
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
### **Phase 2: Feature Synthesis (Stages 07-13)**
|
| 173 |
+
Add handwriting and visual elements to base documents.
|
| 174 |
+
|
| 175 |
+
#### **Stage 07: Extract Handwriting Definitions**
|
| 176 |
+
- **Original**: `pipeline_07_extract_handwriting.py`
|
| 177 |
+
- **API**: `process_stage3_complete()` section in `api/utils.py:1150-1235`
|
| 178 |
+
- **Process**:
|
| 179 |
+
1. Filter geometries: `"handwriting" in geo['selectorTypes']`
|
| 180 |
+
2. Parse classes: Extract `author1`, `author2`, etc. from `class="handwritten author1"`
|
| 181 |
+
3. **Probabilistic filtering** (handwriting_ratio):
|
| 182 |
+
```python
|
| 183 |
+
if random.random() > handwriting_ratio:
|
| 184 |
+
continue # Skip this element
|
| 185 |
+
```
|
| 186 |
+
- `ratio=0.0`: No handwriting (0%)
|
| 187 |
+
- `ratio=0.5`: ~50% of marked elements
|
| 188 |
+
- `ratio=1.0`: All marked elements (100%)
|
| 189 |
+
4. Match geometries to word bboxes:
|
| 190 |
+
- Convert browser coords (96 DPI) to PDF coords (72 DPI): `scale = 72/96 = 0.75`
|
| 191 |
+
- Find consecutive word bboxes matching geometry text
|
| 192 |
+
- Check bboxes are within geometry rect (threshold: 0.7)
|
| 193 |
+
- Track taken bbox indices to avoid duplicates
|
| 194 |
+
5. Build handwriting region definitions:
|
| 195 |
+
```python
|
| 196 |
+
{
|
| 197 |
+
"id": "hw0",
|
| 198 |
+
"text": "Patient Name",
|
| 199 |
+
"author_id": "author1",
|
| 200 |
+
"is_signature": False,
|
| 201 |
+
"rect": {x, y, width, height}, # in points
|
| 202 |
+
"bboxes": ["0_0_0 Patient 10.0 20.0 50.0 35.0", ...]
|
| 203 |
+
}
|
| 204 |
+
```
|
| 205 |
+
- **Reproducibility**: Use `seed + i` for each region to maintain order consistency
|
| 206 |
+
|
| 207 |
+
#### **Stage 08: Extract Visual Element Definitions**
|
| 208 |
+
- **Original**: `pipeline_08_extract_visual_element_definitions.py`
|
| 209 |
+
- **API**: `process_stage3_complete()` section in `api/utils.py:1237-1275`
|
| 210 |
+
- **Process**:
|
| 211 |
+
1. Filter geometries: `"visual_element" in geo['selectorTypes']`
|
| 212 |
+
2. Parse attributes:
|
| 213 |
+
- `data-placeholder`: Element type (logo, photo, figure, chart, barcode, etc.)
|
| 214 |
+
- `data-content`: Semantic description (e.g., "company-logo", "product-photo")
|
| 215 |
+
3. Normalize types using synonyms:
|
| 216 |
+
- "chart" → "figure"
|
| 217 |
+
- "image" → "photo"
|
| 218 |
+
4. Filter by `visual_element_types` parameter (if specified)
|
| 219 |
+
5. Convert coordinates: pixels (96 DPI) → mm
|
| 220 |
+
6. Extract rotation from CSS `transform: rotate(Xdeg)`
|
| 221 |
+
7. Build visual element definitions:
|
| 222 |
+
```python
|
| 223 |
+
{
|
| 224 |
+
"id": "ve0",
|
| 225 |
+
"type": "logo", # normalized
|
| 226 |
+
"content": "company-logo",
|
| 227 |
+
"rect": {x, y, width, height}, # in mm
|
| 228 |
+
"rotation": 0 # degrees
|
| 229 |
+
}
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
#### **Stage 09: Create Handwriting Images**
|
| 233 |
+
- **Original**: `pipeline_09_create_handwriting_images.py`
|
| 234 |
+
- **API**: `call_handwriting_service_batch()` in `api/utils.py:785-920`
|
| 235 |
+
- **Handwriting Service**: RunPod serverless endpoint hosting WordStylist diffusion model
|
| 236 |
+
- **Service Implementation**: `handwriting_service/handler.py`, `handwriting_service/inference.py`
|
| 237 |
+
|
| 238 |
+
**🔄 Handwriting Service Integration Details:**
|
| 239 |
+
|
| 240 |
+
##### **Service Architecture**
|
| 241 |
+
- **Platform**: RunPod Serverless (GPU: NVIDIA A4000, Cost: ~$0.00025/s active)
|
| 242 |
+
- **Model**: WordStylist (Diffusion-based handwriting synthesis)
|
| 243 |
+
- Architecture: UNet with conditional style embeddings
|
| 244 |
+
- Input: Text (A-Z, a-z only, no spaces), Writer style ID (0-656)
|
| 245 |
+
- Output: PNG image with transparent background
|
| 246 |
+
- Inference time: ~18s per text on A4000
|
| 247 |
+
- Weights: `handwriting_service/WordStylist/models/`
|
| 248 |
+
- **Endpoints**:
|
| 249 |
+
- `/run` (async): Queue job, return ID, poll `/status/{id}` (10MB limit)
|
| 250 |
+
- `/runsync` (sync): Wait for completion, return result (20MB limit, used by API)
|
| 251 |
+
|
| 252 |
+
##### **Batch Processing (Cost Optimization)**
|
| 253 |
+
The API uses TRUE batch processing to minimize RunPod activation overhead:
|
| 254 |
+
|
| 255 |
+
```python
|
| 256 |
+
# ✅ NEW: Batch all texts in ONE request
|
| 257 |
+
runpod_request = {
|
| 258 |
+
"input": {
|
| 259 |
+
"texts": [
|
| 260 |
+
{"text": "Hello", "author_id": 42, "hw_id": "hw0_b0_l0_w0"},
|
| 261 |
+
{"text": "World", "author_id": 42, "hw_id": "hw0_b0_l0_w1"},
|
| 262 |
+
# ... 10-100 texts
|
| 263 |
+
],
|
| 264 |
+
"apply_blur": True
|
| 265 |
+
}
|
| 266 |
+
}
|
| 267 |
+
# Result: 1 worker activation × (N × 18s) = ~40-60% cost savings
|
| 268 |
+
```
|
| 269 |
+
|
| 270 |
+
**Cost Comparison for 10 texts:**
|
| 271 |
+
- ❌ OLD (parallel): 10 workers × 18s = 180 worker-seconds + 10× activation fee
|
| 272 |
+
- ✅ NEW (batched): 1 worker × 190s = 190 worker-seconds + 1× activation fee
|
| 273 |
+
|
| 274 |
+
##### **API Processing Flow**
|
| 275 |
+
1. **Group by region and line**: Split handwriting regions into word-level requests
|
| 276 |
+
```python
|
| 277 |
+
# Text: "Patient Name" → 2 word-level generations
|
| 278 |
+
texts_to_generate = [
|
| 279 |
+
{"text": "Patient", "author_id": 42, "hw_id": "hw0_b0_l0_w0"},
|
| 280 |
+
{"text": "Name", "author_id": 42, "hw_id": "hw0_b0_l0_w1"}
|
| 281 |
+
]
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
2. **Map author IDs to numeric styles**:
|
| 285 |
+
```python
|
| 286 |
+
# "author1" → WRITER_STYLES[1] = 42 (deterministic)
|
| 287 |
+
# "author2" → WRITER_STYLES[2] = 137
|
| 288 |
+
# 657 total writer styles available
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
3. **Sanitize text** (WordStylist constraint):
|
| 292 |
+
```python
|
| 293 |
+
# Only A-Z, a-z allowed (no spaces, numbers, punctuation)
|
| 294 |
+
"Hello123!" → "Hello"
|
| 295 |
+
"first-name" → "firstname"
|
| 296 |
+
```
|
| 297 |
+
|
| 298 |
+
4. **Send batch request** to RunPod `/runsync` endpoint:
|
| 299 |
+
```python
|
| 300 |
+
POST https://api.runpod.ai/v2/{endpoint_id}/runsync
|
| 301 |
+
Authorization: Bearer {RUNPOD_API_KEY}
|
| 302 |
+
Content-Type: application/json
|
| 303 |
+
|
| 304 |
+
{
|
| 305 |
+
"input": {
|
| 306 |
+
"texts": [...],
|
| 307 |
+
"apply_blur": True # Gaussian blur for realism
|
| 308 |
+
}
|
| 309 |
+
}
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
5. **Handle async responses**:
|
| 313 |
+
- If `status: "IN_PROGRESS"`: Poll `/status/{job_id}` every 5-10s (max 30 polls)
|
| 314 |
+
- If `status: "COMPLETED"`: Extract `output.images[]`
|
| 315 |
+
- If `status: "FAILED"`: Raise exception (stops entire generation)
|
| 316 |
+
|
| 317 |
+
6. **Response format**:
|
| 318 |
+
```python
|
| 319 |
+
{
|
| 320 |
+
"status": "COMPLETED",
|
| 321 |
+
"output": {
|
| 322 |
+
"images": [
|
| 323 |
+
{
|
| 324 |
+
"image_base64": "iVBORw0KGgoAAAANSU...",
|
| 325 |
+
"width": 200,
|
| 326 |
+
"height": 64,
|
| 327 |
+
"text": "Patient",
|
| 328 |
+
"author_id": 42,
|
| 329 |
+
"hw_id": "hw0_b0_l0_w0"
|
| 330 |
+
},
|
| 331 |
+
...
|
| 332 |
+
],
|
| 333 |
+
"total_generated": 2
|
| 334 |
+
}
|
| 335 |
+
}
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
7. **Store generated images**: Map `hw_id → image_base64` for insertion
|
| 339 |
+
|
| 340 |
+
##### **Error Handling**
|
| 341 |
+
- **Retry logic**: 3 attempts with exponential backoff (matching seed download)
|
| 342 |
+
- **Timeouts**: Dynamic based on batch size: `20s × num_texts + 30s buffer`
|
| 343 |
+
- **Failure behavior**: **RAISE EXCEPTION** (since session fix)
|
| 344 |
+
- ❌ OLD: Silent continue → Documents without handwriting
|
| 345 |
+
- ✅ NEW: Raise exception → Generation fails when user requested handwriting
|
| 346 |
+
|
| 347 |
+
##### **Service Code Structure**
|
| 348 |
+
**`handwriting_service/handler.py`** (RunPod handler):
|
| 349 |
+
```python
|
| 350 |
+
# Initialize model ONCE at module level (not per request)
|
| 351 |
+
generator = HandwritingGenerator(
|
| 352 |
+
model_dir="WordStylist",
|
| 353 |
+
checkpoint_path="WordStylist/models",
|
| 354 |
+
device="cuda"
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
def handler(job):
|
| 358 |
+
"""RunPod entry point - supports both /run and /runsync"""
|
| 359 |
+
texts = job["input"]["texts"] # Batch input
|
| 360 |
+
results = generator.generate_batch(
|
| 361 |
+
texts=[t["text"] for t in texts],
|
| 362 |
+
author_ids=[t["author_id"] for t in texts],
|
| 363 |
+
num_inference_steps=50,
|
| 364 |
+
temperature=1.0,
|
| 365 |
+
apply_blur=True
|
| 366 |
+
)
|
| 367 |
+
return {"images": results, "total_generated": len(results)}
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
**`handwriting_service/inference.py`** (WordStylist wrapper):
|
| 371 |
+
```python
|
| 372 |
+
class HandwritingGenerator:
|
| 373 |
+
def generate_batch(self, texts, author_ids, ...):
|
| 374 |
+
results = []
|
| 375 |
+
for text, author_id in zip(texts, author_ids):
|
| 376 |
+
# Load model checkpoint
|
| 377 |
+
unet = Unet(...)
|
| 378 |
+
unet.load_state_dict(checkpoint)
|
| 379 |
+
|
| 380 |
+
# Prepare style condition
|
| 381 |
+
style_id_tensor = torch.tensor([author_id])
|
| 382 |
+
|
| 383 |
+
# Diffusion reverse process (50 steps)
|
| 384 |
+
img = self.sample(unet, style_id_tensor, text_length=len(text))
|
| 385 |
+
|
| 386 |
+
# Post-process: crop, resize, apply blur
|
| 387 |
+
img_pil = postprocess_image(img)
|
| 388 |
+
if apply_blur:
|
| 389 |
+
img_pil = img_pil.filter(ImageFilter.GaussianBlur(1.2))
|
| 390 |
+
|
| 391 |
+
# Encode to base64
|
| 392 |
+
img_base64 = encode_pil_to_base64(img_pil)
|
| 393 |
+
results.append({
|
| 394 |
+
"image_base64": img_base64,
|
| 395 |
+
"width": img_pil.width,
|
| 396 |
+
"height": img_pil.height
|
| 397 |
+
})
|
| 398 |
+
|
| 399 |
+
return results
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
#### **Stage 10: Create Visual Element Images**
|
| 403 |
+
- **Original**: `pipeline_10_create_visual_elements.py`
|
| 404 |
+
- **API**: `generate_visual_element_images()` in `api/utils.py:925-1020`
|
| 405 |
+
- **Process**:
|
| 406 |
+
1. Load prefab images from `data/visual_element_prefabs/{type}/`:
|
| 407 |
+
- `logo/`: Company logos (50+ SVGs)
|
| 408 |
+
- `photo/`: Stock photos (100+ JPGs)
|
| 409 |
+
- `figure/`: Charts, graphs (30+ PNGs)
|
| 410 |
+
- `barcode/`: Generated barcodes
|
| 411 |
+
- `qr_code/`, `stamp/`, `signature/`, `checkbox/`, etc.
|
| 412 |
+
2. **Random selection** (seed-based if provided):
|
| 413 |
+
```python
|
| 414 |
+
if seed is not None:
|
| 415 |
+
random.seed(seed)
|
| 416 |
+
prefab_path = random.choice(list(prefab_dir.glob("*")))
|
| 417 |
+
```
|
| 418 |
+
3. **Special handling**:
|
| 419 |
+
- **Barcode**: Generate on-the-fly using `python-barcode` library
|
| 420 |
+
```python
|
| 421 |
+
# Generate random EAN-13 barcode (12 digits + checksum)
|
| 422 |
+
barcode_num = random.randint(100000000000, 999999999999)
|
| 423 |
+
barcode = EAN13(str(barcode_num), writer=ImageWriter())
|
| 424 |
+
```
|
| 425 |
+
- **QR Code**: Generate using `qrcode` library
|
| 426 |
+
- **Checkbox**: Render checked/unchecked SVG
|
| 427 |
+
4. Load and convert to base64:
|
| 428 |
+
```python
|
| 429 |
+
with open(prefab_path, 'rb') as f:
|
| 430 |
+
img_bytes = f.read()
|
| 431 |
+
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
| 432 |
+
```
|
| 433 |
+
5. Return mapping: `ve_id → image_base64`
|
| 434 |
+
|
| 435 |
+
#### **Stage 11: Make Text Transparent (Implicit)**
|
| 436 |
+
- **Original**: `pipeline_11_make_text_transparent.py`
|
| 437 |
+
- **API**: Implemented as "whiteout" in `process_stage3_complete()` at `api/utils.py:1415-1427`
|
| 438 |
+
- **Process**:
|
| 439 |
+
```python
|
| 440 |
+
# Draw white rectangles over original text to hide it
|
| 441 |
+
for hw_region in handwriting_regions:
|
| 442 |
+
for bbox_str in hw_region['bboxes']:
|
| 443 |
+
bbox = parse_bbox(bbox_str)
|
| 444 |
+
rect = fitz.Rect(bbox.x0, bbox.y0, bbox.x2, bbox.y2)
|
| 445 |
+
page.draw_rect(rect, color=(1,1,1), fill=(1,1,1)) # White fill
|
| 446 |
+
```
|
| 447 |
+
- **Why not transparent?**: PyMuPDF doesn't support making existing text transparent, so we use white rectangles instead (same visual result)
|
| 448 |
+
|
| 449 |
+
#### **Stage 12: Insert Handwriting Images**
|
| 450 |
+
- **Original**: `pipeline_12_insert_handwriting_images.py`
|
| 451 |
+
- **API**: `process_stage3_complete()` section in `api/utils.py:1429-1520`
|
| 452 |
+
- **Process**:
|
| 453 |
+
1. **Position calculation**:
|
| 454 |
+
```python
|
| 455 |
+
# Get word bbox from PDF extraction
|
| 456 |
+
bbox_w = bbox.x2 - bbox.x0 # Width in points
|
| 457 |
+
bbox_h = bbox.y2 - bbox.y0 # Height in points
|
| 458 |
+
|
| 459 |
+
# Resize handwriting image with aspect ratio
|
| 460 |
+
scale = min(bbox_w / img_width, bbox_h / img_height)
|
| 461 |
+
new_w = int(img_width * scale * SCALE_UP_FACTOR) # 3x upscale
|
| 462 |
+
new_h = int(img_height * scale * SCALE_UP_FACTOR)
|
| 463 |
+
|
| 464 |
+
# Add random offsets for natural variation
|
| 465 |
+
offset_x = random.randint(-MAX_OFFSET_LEFT, MAX_OFFSET_RIGHT) + FIXED_OFFSET
|
| 466 |
+
offset_y = random.randint(-MAX_OFFSET_UP, MAX_OFFSET_DOWN)
|
| 467 |
+
|
| 468 |
+
# Position at bbox coordinates
|
| 469 |
+
x0 = bbox.x0 + offset_x
|
| 470 |
+
y0 = bbox.y0 + offset_y - y_padding
|
| 471 |
+
```
|
| 472 |
+
|
| 473 |
+
2. **Insert into PDF**:
|
| 474 |
+
```python
|
| 475 |
+
img_resized = img.resize((new_w, new_h), Image.LANCZOS).convert("RGBA")
|
| 476 |
+
img_bytes = pil_to_bytes(img_resized)
|
| 477 |
+
rect = fitz.Rect(x0, y0, x0 + bbox_w, y0 + bbox_h)
|
| 478 |
+
page.insert_image(rect, stream=img_bytes)
|
| 479 |
+
```
|
| 480 |
+
|
| 481 |
+
3. Save intermediate PDF: `{doc_id}_with_handwriting.pdf`
|
| 482 |
+
|
| 483 |
+
#### **Stage 13: Insert Visual Elements**
|
| 484 |
+
- **Original**: `pipeline_13_insert_visual_elements.py`
|
| 485 |
+
- **API**: `process_stage3_complete()` section in `api/utils.py:1523-1625`
|
| 486 |
+
- **Process**:
|
| 487 |
+
1. Convert mm → points: `mm_to_pt = 72 / 25.4`
|
| 488 |
+
2. Resize with aspect ratio preservation (same as handwriting)
|
| 489 |
+
3. Center image on white background (maintains bbox size)
|
| 490 |
+
4. Insert into PDF at geometry coordinates
|
| 491 |
+
5. Save final PDF: `{doc_id}_final.pdf` (includes both handwriting + visual elements)
|
| 492 |
+
|
| 493 |
+
---
|
| 494 |
+
|
| 495 |
+
### **Phase 3: Image Finalization & OCR (Stages 14-15)**
|
| 496 |
+
Convert final PDF to high-resolution image and extract OCR data.
|
| 497 |
+
|
| 498 |
+
#### **Stage 14: Render Image**
|
| 499 |
+
- **Original**: `pipeline_14_render_image.py`
|
| 500 |
+
- **API**: `process_stage4_ocr()` in `api/utils.py:1899-1940`
|
| 501 |
+
- **Process**:
|
| 502 |
+
```python
|
| 503 |
+
# Render PDF page to high-res PNG
|
| 504 |
+
page = fitz.open(pdf_path)[0]
|
| 505 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(3, 3)) # 3x scale = ~220 DPI
|
| 506 |
+
img_bytes = pix.tobytes("png")
|
| 507 |
+
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
| 508 |
+
```
|
| 509 |
+
- **Output**: Base64-encoded PNG at 220 DPI (configurable via scale factor)
|
| 510 |
+
|
| 511 |
+
#### **Stage 15: Perform OCR**
|
| 512 |
+
- **Original**: `pipeline_15_perform_ocr.py`
|
| 513 |
+
- **API**: `run_paddle_ocr()` in `api/utils.py:1950-2080`
|
| 514 |
+
- **OCR Engine**: PaddleOCR v4 (multilingual)
|
| 515 |
+
- Models: `PP-OCRv4` detection + recognition
|
| 516 |
+
- Languages: Supports 80+ languages
|
| 517 |
+
- Accuracy: State-of-the-art open-source OCR
|
| 518 |
+
- **Process**:
|
| 519 |
+
1. Render PDF to image via `pdf2image` at specified DPI (default: 300)
|
| 520 |
+
2. Initialize PaddleOCR with language parameter
|
| 521 |
+
3. Run detection + recognition:
|
| 522 |
+
```python
|
| 523 |
+
ocr = PaddleOCR(lang=language, use_gpu=True)
|
| 524 |
+
results = ocr.ocr(img_array, cls=True)
|
| 525 |
+
```
|
| 526 |
+
4. Parse results into word-level bboxes:
|
| 527 |
+
```python
|
| 528 |
+
{
|
| 529 |
+
"text": "word",
|
| 530 |
+
"bbox": {
|
| 531 |
+
"x0": float,
|
| 532 |
+
"y0": float,
|
| 533 |
+
"x1": float, # right
|
| 534 |
+
"y1": float # bottom
|
| 535 |
+
},
|
| 536 |
+
"confidence": 0.95
|
| 537 |
+
}
|
| 538 |
+
```
|
| 539 |
+
- **Output**: Dictionary with `words` list, image dimensions, OCR engine info
|
| 540 |
+
|
| 541 |
+
---
|
| 542 |
+
|
| 543 |
+
### **Phase 4: Dataset Packaging (Stages 16-19)**
|
| 544 |
+
Normalize, verify, analyze, and export final dataset.
|
| 545 |
+
|
| 546 |
+
#### **Stage 16: Normalize Bboxes**
|
| 547 |
+
- **Original**: `pipeline_16_normalize_bboxes.py`
|
| 548 |
+
- **API**: `normalize_bboxes()` in `api/utils.py:2100-2180`
|
| 549 |
+
- **Process**:
|
| 550 |
+
1. Convert absolute pixel coordinates → normalized [0, 1] range:
|
| 551 |
+
```python
|
| 552 |
+
norm_bbox = [
|
| 553 |
+
bbox['x0'] / img_width,
|
| 554 |
+
bbox['y0'] / img_height,
|
| 555 |
+
bbox['x1'] / img_width,
|
| 556 |
+
bbox['y1'] / img_height
|
| 557 |
+
]
|
| 558 |
+
```
|
| 559 |
+
2. Clip to [0, 1]: `[max(0, min(1, x)) for x in norm_bbox]`
|
| 560 |
+
3. Create word-level and segment-level bboxes
|
| 561 |
+
- **Output**: List of `{text, bbox: [x0, y0, x1, y1]}` where bbox is normalized
|
| 562 |
+
|
| 563 |
+
#### **Stage 17: Ground Truth Verification**
|
| 564 |
+
- **Original**: `pipeline_17_gt_preparation_verification.py`
|
| 565 |
+
- **API**: `verify_ground_truth()` in `api/utils.py:2185-2250`
|
| 566 |
+
- **Checks**:
|
| 567 |
+
- GT structure: Valid JSON, required fields
|
| 568 |
+
- Text matching: GT text exists in OCR output
|
| 569 |
+
- Bbox coverage: GT answers have corresponding bboxes
|
| 570 |
+
- **Output**: Verification report with pass/fail status
|
| 571 |
+
|
| 572 |
+
#### **Stage 18: Analyze**
|
| 573 |
+
- **Original**: `pipeline_18_analyze.py`
|
| 574 |
+
- **API**: `analyze_document()` in `api/utils.py:2255-2320`
|
| 575 |
+
- **Metrics**:
|
| 576 |
+
- Word count, character count
|
| 577 |
+
- Average word length
|
| 578 |
+
- Handwriting regions count, coverage %
|
| 579 |
+
- Visual elements count by type
|
| 580 |
+
- OCR confidence statistics (mean, min, max)
|
| 581 |
+
- **Output**: Analysis dictionary with computed metrics
|
| 582 |
+
|
| 583 |
+
#### **Stage 19: Create Debug Data & Export**
|
| 584 |
+
- **Original**: `pipeline_19_create_debug_data.py`
|
| 585 |
+
- **API**: `export_to_msgpack()` in `api/utils.py:2350-2520`
|
| 586 |
+
- **Debug Visualization**:
|
| 587 |
+
- Draw bboxes on image with different colors:
|
| 588 |
+
- Green: Word bboxes
|
| 589 |
+
- Red: Handwriting regions
|
| 590 |
+
- Blue: Visual elements
|
| 591 |
+
- Yellow: Ground truth target regions
|
| 592 |
+
- Save annotated image
|
| 593 |
+
- **Dataset Export (msgpack)**:
|
| 594 |
+
```python
|
| 595 |
+
dataset_entry = {
|
| 596 |
+
"image": img_bytes, # PNG bytes
|
| 597 |
+
"words": ["hello", "world"],
|
| 598 |
+
"word_bboxes": [[0.1, 0.2, 0.15, 0.25], ...], # Normalized
|
| 599 |
+
"segment_bboxes": [...],
|
| 600 |
+
"ground_truth": {"question": "answer"},
|
| 601 |
+
"metadata": {
|
| 602 |
+
"document_id": "...",
|
| 603 |
+
"has_handwriting": True,
|
| 604 |
+
"num_visual_elements": 3
|
| 605 |
+
}
|
| 606 |
+
}
|
| 607 |
+
msgpack.dump(dataset_entry, f)
|
| 608 |
+
```
|
| 609 |
+
- **Output**: `.msgpack` file compatible with PyTorch DataLoader
|
| 610 |
+
|
| 611 |
+
---
|
| 612 |
+
|
| 613 |
+
## Pipeline Verification: API vs Original Implementation
|
| 614 |
+
|
| 615 |
+
### ✅ **Stage-by-Stage Mapping**
|
| 616 |
+
|
| 617 |
+
| Stage | Original File | API Function | Status |
|
| 618 |
+
|-------|--------------|--------------|--------|
|
| 619 |
+
| 01 | `pipeline_01_select_seeds.py` | `download_seed_images()` | ✅ Mapped (with retry logic) |
|
| 620 |
+
| 02 | `pipeline_02_prompt_llm.py` | `call_claude_api_direct()` | ✅ Mapped (uses Messages API) |
|
| 621 |
+
| 03 | `pipeline_03_process_response.py` | `extract_html_documents_from_response()` | ✅ Mapped |
|
| 622 |
+
| 04 | `pipeline_04_render_pdf_and_extract_geos.py` | `render_html_to_pdf()` | ✅ Mapped (Playwright) |
|
| 623 |
+
| 05 | `pipeline_05_extract_bboxes_from_pdf.py` | `extract_bboxes_from_rendered_pdf()` | ✅ Mapped |
|
| 624 |
+
| 06 | `pipeline_06_validation.py` | `validate_html_structure()`, `validate_pdf()` | ✅ Mapped |
|
| 625 |
+
| 07 | `pipeline_07_extract_handwriting.py` | `process_stage3_complete()` section | ✅ Mapped (with ratio filter) |
|
| 626 |
+
| 08 | `pipeline_08_extract_visual_element_definitions.py` | `process_stage3_complete()` section | ✅ Mapped |
|
| 627 |
+
| 09 | `pipeline_09_create_handwriting_images.py` | `call_handwriting_service_batch()` | ✅ Mapped (RunPod integration) |
|
| 628 |
+
| 10 | `pipeline_10_create_visual_elements.py` | `generate_visual_element_images()` | ✅ Mapped |
|
| 629 |
+
| 11 | `pipeline_11_make_text_transparent.py` | `process_stage3_complete()` (whiteout) | ✅ Mapped (white rectangles) |
|
| 630 |
+
| 12 | `pipeline_12_insert_handwriting_images.py` | `process_stage3_complete()` section | ✅ Mapped |
|
| 631 |
+
| 13 | `pipeline_13_insert_visual_elements.py` | `process_stage3_complete()` section | ✅ Mapped |
|
| 632 |
+
| 14 | `pipeline_14_render_image.py` | `process_stage4_ocr()` | ✅ Mapped |
|
| 633 |
+
| 15 | `pipeline_15_perform_ocr.py` | `run_paddle_ocr()` | ✅ Mapped |
|
| 634 |
+
| 16 | `pipeline_16_normalize_bboxes.py` | `normalize_bboxes()` | ✅ Mapped |
|
| 635 |
+
| 17 | `pipeline_17_gt_preparation_verification.py` | `verify_ground_truth()` | ✅ Mapped |
|
| 636 |
+
| 18 | `pipeline_18_analyze.py` | `analyze_document()` | ✅ Mapped |
|
| 637 |
+
| 19 | `pipeline_19_create_debug_data.py` | `export_to_msgpack()` | ✅ Mapped |
|
| 638 |
+
|
| 639 |
+
### 📊 **Key Differences: API vs Batch Pipeline**
|
| 640 |
+
|
| 641 |
+
#### **Processing Model**
|
| 642 |
+
- **Original**: Batch processing with file-based state management
|
| 643 |
+
- Input: CSV of seed selections, prompt parameters in JSON
|
| 644 |
+
- Output: Folder structure with intermediate files
|
| 645 |
+
- State: JSON logs per document + message
|
| 646 |
+
- Resumability: Can restart from any stage
|
| 647 |
+
|
| 648 |
+
- **API**: Request/response with in-memory processing
|
| 649 |
+
- Input: JSON request with seed URLs
|
| 650 |
+
- Output: JSON response or ZIP file
|
| 651 |
+
- State: Ephemeral (temporary directories)
|
| 652 |
+
- Resumability: None (single-shot generation)
|
| 653 |
+
|
| 654 |
+
#### **Handwriting Generation**
|
| 655 |
+
- **Original**: Local GPU with WordStylist model loaded in-process
|
| 656 |
+
- Location: `docgenie/generation/handwriting_diffusion/`
|
| 657 |
+
- Execution: `generate_handwriting_diffusion_raw.py`
|
| 658 |
+
- Cost: Free (local GPU)
|
| 659 |
+
|
| 660 |
+
- **API**: Remote RunPod serverless endpoint
|
| 661 |
+
- Location: `handwriting_service/` (deployed separately)
|
| 662 |
+
- Execution: HTTP POST to RunPod API
|
| 663 |
+
- Cost: ~$0.00025/s GPU time (pay-per-use)
|
| 664 |
+
- Benefit: No local GPU required, scales automatically
|
| 665 |
+
|
| 666 |
+
#### **Seed Selection**
|
| 667 |
+
- **Original**: Pre-crawled dataset with systematic selection
|
| 668 |
+
- Seeds stored in: `data/datasets/base_v2/`
|
| 669 |
+
- Selection: Clustering algorithm → balanced subset
|
| 670 |
+
- Tracking: CSV manifest with seed IDs
|
| 671 |
+
|
| 672 |
+
- **API**: User-provided URLs
|
| 673 |
+
- Seeds: Any publicly accessible image URL
|
| 674 |
+
- Selection: User chooses 1-8 images per request
|
| 675 |
+
- Tracking: URLs stored in request metadata
|
| 676 |
+
|
| 677 |
+
#### **Prompt Templates**
|
| 678 |
+
- **Original**: Multiple template versions in folders
|
| 679 |
+
- Path: `data/prompt_templates/{version}/seed-based-json.txt`
|
| 680 |
+
- Versioning: ClaudeRefined1 → ClaudeRefined12
|
| 681 |
+
- Selection: Configurable per dataset
|
| 682 |
+
|
| 683 |
+
- **API**: Fixed template (latest version)
|
| 684 |
+
- Path: `data/prompt_templates/ClaudeRefined12/seed-based-json.txt`
|
| 685 |
+
- Hardcoded in: `api/main.py:171`
|
| 686 |
+
- **Future improvement**: Make template selectable via API parameter
|
| 687 |
+
|
| 688 |
+
---
|
| 689 |
+
|
| 690 |
+
## Complete Request Flow Example
|
| 691 |
+
|
| 692 |
+
### Example Request (Sync Endpoint)
|
| 693 |
+
```bash
|
| 694 |
+
POST /generate/pdf HTTP/1.1
|
| 695 |
+
Content-Type: application/json
|
| 696 |
+
|
| 697 |
+
{
|
| 698 |
+
"seed_images": [
|
| 699 |
+
"https://example.com/seed1.jpg",
|
| 700 |
+
"https://example.com/seed2.jpg"
|
| 701 |
+
],
|
| 702 |
+
"prompt_params": {
|
| 703 |
+
"language": "english",
|
| 704 |
+
"doc_type": "medical_form",
|
| 705 |
+
"gt_type": "kie",
|
| 706 |
+
"gt_format": "json",
|
| 707 |
+
"num_solutions": 2,
|
| 708 |
+
"enable_handwriting": true,
|
| 709 |
+
"handwriting_ratio": 0.3,
|
| 710 |
+
"enable_visual_elements": true,
|
| 711 |
+
"visual_element_types": ["logo", "signature"],
|
| 712 |
+
"enable_ocr": true,
|
| 713 |
+
"enable_dataset_export": true,
|
| 714 |
+
"seed": 42
|
| 715 |
+
}
|
| 716 |
+
}
|
| 717 |
+
```
|
| 718 |
+
|
| 719 |
+
### Processing Flow (Stages Executed)
|
| 720 |
+
|
| 721 |
+
**Phase 1: Core Document Generation (30-60s)**
|
| 722 |
+
1. ✅ Download 2 seed images with retry → `[img1_b64, img2_b64]`
|
| 723 |
+
2. ✅ Load prompt template → Build prompt for medical_form + KIE
|
| 724 |
+
3. ✅ Call Claude API → LLM generates 2 HTML documents (~25s)
|
| 725 |
+
4. ✅ Extract HTML + ground truth → 2 clean HTML files with GT JSON
|
| 726 |
+
5. ✅ Render each HTML to PDF via Playwright → 2 PDFs + geometries
|
| 727 |
+
6. ✅ Extract word bboxes from PDFs → ~200-500 words per document
|
| 728 |
+
|
| 729 |
+
**Phase 2: Feature Synthesis (120-280s if handwriting enabled; ~270s in this example)**
|
| 730 |
+
7. ✅ Parse geometries for handwriting markers
|
| 731 |
+
- Found: 12 elements with `class="handwritten"`
|
| 732 |
+
- Filtered by ratio: 12 × 0.3 = ~4 elements selected (probabilistic)
|
| 733 |
+
- Matched to word bboxes: 4 regions with 15 total words
|
| 734 |
+
8. ✅ Parse geometries for visual elements
|
| 735 |
+
- Found: 3 elements (`data-placeholder="logo"`, `"signature"`, `"logo"`)
|
| 736 |
+
- Filtered by types: Keep logo + signature, remove others
|
| 737 |
+
- Result: 2 visual element definitions
|
| 738 |
+
9. ✅ Generate handwriting images via RunPod
|
| 739 |
+
- **Batch request**: 15 words in ONE API call
|
| 740 |
+
- Map author IDs: `author1 → style 42`, `author2 → style 137`
|
| 741 |
+
- RunPod processing: 1 worker × (15 × 18s) = ~270s
|
| 742 |
+
- Result: 15 PNG images (base64-encoded)
|
| 743 |
+
10. ✅ Generate visual element images
|
| 744 |
+
- Logo: Random selection from `data/visual_element_prefabs/logo/` (seed=42)
|
| 745 |
+
- Signature: Generate on-the-fly using signature prefab
|
| 746 |
+
- Result: 2 PNG images
|
| 747 |
+
11. ✅ Whiteout original text: Draw white rectangles over 15 word positions
|
| 748 |
+
12. ✅ Insert handwriting: Place 15 generated images at word bboxes with offsets
|
| 749 |
+
- Save: `doc1_with_handwriting.pdf`, `doc2_with_handwriting.pdf`
|
| 750 |
+
13. ✅ Insert visual elements: Place logo + signature at geometry coords
|
| 751 |
+
- Save: `doc1_final.pdf`, `doc2_final.pdf`
|
| 752 |
+
|
| 753 |
+
**Phase 3: Image + OCR (5-10s)**
|
| 754 |
+
14. ✅ Render each final PDF to 220 DPI image → 2 PNG files (base64)
|
| 755 |
+
15. ✅ Run PaddleOCR on each image
|
| 756 |
+
- Doc1: Detected 187 words, avg confidence 0.91
|
| 757 |
+
- Doc2: Detected 203 words, avg confidence 0.94
|
| 758 |
+
|
| 759 |
+
**Phase 4: Dataset Packaging (2-5s)**
|
| 760 |
+
16. ✅ Normalize OCR bboxes: Convert pixels → [0,1] range
|
| 761 |
+
17. ✅ Verify ground truth: Check GT fields match OCR output (enabled=false, skipped)
|
| 762 |
+
18. ✅ Analyze documents: Compute metrics (enabled=false, skipped)
|
| 763 |
+
19. ✅ Export to msgpack:
|
| 764 |
+
- Doc1: Pack image + words + normalized bboxes + GT → `doc1.msgpack`
|
| 765 |
+
- Doc2: Pack image + words + normalized bboxes + GT → `doc2.msgpack`
|
| 766 |
+
|
| 767 |
+
**Final Output: ZIP File Contents**
|
| 768 |
+
```
|
| 769 |
+
dataset.zip
|
| 770 |
+
├── doc1_uuid_0.pdf # Original rendered PDF
|
| 771 |
+
├── doc1_uuid_0_final.pdf # PDF with handwriting + visual elements
|
| 772 |
+
├── doc1_uuid_0.msgpack # Dataset format
|
| 773 |
+
├── doc2_uuid_1.pdf
|
| 774 |
+
├── doc2_uuid_1_final.pdf
|
| 775 |
+
├── doc2_uuid_1.msgpack
|
| 776 |
+
├── metadata.json # Complete generation metadata
|
| 777 |
+
└── handwriting/
|
| 778 |
+
├── hw0_b0_l0_w0.png # Individual handwriting images
|
| 779 |
+
├── hw0_b0_l0_w1.png
|
| 780 |
+
└── ... (13 more)
|
| 781 |
+
```
|
| 782 |
+
|
| 783 |
+
### Response (JSON Metadata)
|
| 784 |
+
```json
|
| 785 |
+
{
|
| 786 |
+
"task_id": "uuid-here",
|
| 787 |
+
"status": "completed",
|
| 788 |
+
"num_documents": 2,
|
| 789 |
+
"processing_time_seconds": 305.7,
|
| 790 |
+
"stages_completed": [
|
| 791 |
+
"seed_download", "llm_prompt", "html_extraction",
|
| 792 |
+
"pdf_render", "bbox_extraction", "handwriting_extraction",
|
| 793 |
+
"visual_element_extraction", "handwriting_generation",
|
| 794 |
+
"visual_element_generation", "handwriting_insertion",
|
| 795 |
+
"visual_element_insertion", "image_render", "ocr",
|
| 796 |
+
"bbox_normalization", "dataset_export"
|
| 797 |
+
],
|
| 798 |
+
"documents": [
|
| 799 |
+
{
|
| 800 |
+
"document_id": "doc1_uuid_0",
|
| 801 |
+
"ground_truth": {"patient_name": "John Doe", "date": "2024-01-15"},
|
| 802 |
+
"num_words": 187,
|
| 803 |
+
"num_handwriting_regions": 2,
|
| 804 |
+
"num_visual_elements": 2,
|
| 805 |
+
"ocr_confidence_avg": 0.91
|
| 806 |
+
},
|
| 807 |
+
{
|
| 808 |
+
"document_id": "doc2_uuid_1",
|
| 809 |
+
"ground_truth": {"patient_name": "Jane Smith", "date": "2024-01-16"},
|
| 810 |
+
"num_words": 203,
|
| 811 |
+
"num_handwriting_regions": 2,
|
| 812 |
+
"num_visual_elements": 2,
|
| 813 |
+
"ocr_confidence_avg": 0.94
|
| 814 |
+
}
|
| 815 |
+
],
|
| 816 |
+
"download_url": "/download/dataset_uuid.zip"
|
| 817 |
+
}
|
| 818 |
+
```
|
| 819 |
+
|
| 820 |
+
---
|
| 821 |
+
|
| 822 |
+
## Configuration & Environment
|
| 823 |
+
|
| 824 |
+
### Required Environment Variables
|
| 825 |
+
```bash
|
| 826 |
+
# LLM API
|
| 827 |
+
ANTHROPIC_API_KEY=sk-ant-... # Claude API key
|
| 828 |
+
CLAUDE_MODEL=claude-3-5-sonnet-20241022 # Default model
|
| 829 |
+
|
| 830 |
+
# Handwriting Service (RunPod)
|
| 831 |
+
HANDWRITING_SERVICE_ENABLED=true
|
| 832 |
+
HANDWRITING_SERVICE_URL=https://api.runpod.ai/v2/{endpoint_id}/runsync
|
| 833 |
+
RUNPOD_API_KEY=... # RunPod API key
|
| 834 |
+
HANDWRITING_APPLY_BLUR=true # Gaussian blur for realism
|
| 835 |
+
HANDWRITING_SERVICE_MAX_RETRIES=3
|
| 836 |
+
HANDWRITING_SERVICE_TIMEOUT=600 # 10 minutes for large batches
|
| 837 |
+
|
| 838 |
+
# OCR Configuration
|
| 839 |
+
OCR_DPI=300 # Image resolution for OCR
|
| 840 |
+
OCR_LANGUAGE=en # PaddleOCR language code
|
| 841 |
+
|
| 842 |
+
# File Paths
|
| 843 |
+
PROMPT_TEMPLATES_DIR=/path/to/data/prompt_templates
|
| 844 |
+
VISUAL_ELEMENT_PREFABS_DIR=/path/to/data/visual_element_prefabs
|
| 845 |
+
```
|
| 846 |
+
|
| 847 |
+
### Docker Deployment (Railway)
|
| 848 |
+
```dockerfile
|
| 849 |
+
# Dockerfile (api service)
|
| 850 |
+
FROM python:3.11-slim
|
| 851 |
+
RUN apt-get update && apt-get install -y \
|
| 852 |
+
# Playwright dependencies
chromium chromium-driver \
|
| 853 |
+
# PaddleOCR dependencies
libgl1 libglib2.0-0 \
|
| 854 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 855 |
+
|
| 856 |
+
COPY api/ /app/api
|
| 857 |
+
COPY docgenie/ /app/docgenie
|
| 858 |
+
COPY data/ /app/data
|
| 859 |
+
WORKDIR /app/api
|
| 860 |
+
RUN pip install -r requirements.txt
|
| 861 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 862 |
+
```
|
| 863 |
+
|
| 864 |
+
**Handwriting service**: See `handwriting_service/Dockerfile` (deployed separately to RunPod)
|
| 865 |
+
|
| 866 |
+
---
|
| 867 |
+
|
| 868 |
+
## Performance & Costs
|
| 869 |
+
|
| 870 |
+
### Timing Breakdown (Single Document)
|
| 871 |
+
| Stage | Time | Notes |
|
| 872 |
+
|-------|------|-------|
|
| 873 |
+
| Seed download | 0.5-2s | Depends on image size + network |
|
| 874 |
+
| LLM prompt | 20-40s | Claude API latency |
|
| 875 |
+
| PDF render | 1-3s | Playwright initialization |
|
| 876 |
+
| Handwriting (10 words) | 180s | RunPod: 1 worker × (10×18s) |
|
| 877 |
+
| Visual elements | 0.5-1s | Local file selection |
|
| 878 |
+
| OCR | 3-5s | PaddleOCR inference |
|
| 879 |
+
| Dataset export | 0.5-1s | msgpack serialization |
|
| 880 |
+
| **TOTAL (no handwriting)** | **25-50s** | |
|
| 881 |
+
| **TOTAL (with handwriting)** | **200-230s** | Batched |
|
| 882 |
+
|
| 883 |
+
### Cost Breakdown (Per Document)
|
| 884 |
+
| Component | Cost | Notes |
|
| 885 |
+
|-----------|------|-------|
|
| 886 |
+
| Claude API | $0.015-0.03 | ~5K input + 16K output tokens |
|
| 887 |
+
| RunPod GPU (10 words) | $0.045 | 180s × $0.00025/s |
|
| 888 |
+
| Storage | Negligible | Temporary files deleted |
|
| 889 |
+
| **TOTAL (no handwriting)** | **$0.015-0.03** | |
|
| 890 |
+
| **TOTAL (with handwriting)** | **$0.06-0.08** | |
|
| 891 |
+
|
| 892 |
+
**Optimization**: Batch multiple documents in ONE RunPod call to share worker activation overhead.
|
| 893 |
+
|
| 894 |
+
---
|
| 895 |
+
|
| 896 |
+
## Error Handling & Reliability
|
| 897 |
+
|
| 898 |
+
### Retry Mechanisms
|
| 899 |
+
1. **Seed image download**: 3 attempts, exponential backoff (2s, 4s, 8s)
|
| 900 |
+
2. **Handwriting service**: 3 attempts, status polling up to 30 times
|
| 901 |
+
3. **LLM API**: Built-in Anthropic SDK retries (rate limits, 529 errors)
|
| 902 |
+
|
| 903 |
+
### Failure Modes
|
| 904 |
+
| Error Type | Behavior | User Impact |
|
| 905 |
+
|------------|----------|-------------|
|
| 906 |
+
| Seed download failure | Raise HTTP 400 | Request rejected immediately |
|
| 907 |
+
| LLM API error | Raise HTTP 500 | No charge, can retry |
|
| 908 |
+
| Handwriting service failure | **Raise exception** (NEW) | Generation fails, prevents invalid outputs |
|
| 909 |
+
| OCR failure | Log warning, continue | Document generated without OCR data |
|
| 910 |
+
| PDF render failure | Raise HTTP 500 | Request fails, no partial results |
|
| 911 |
+
|
| 912 |
+
### Session Fixes Applied
|
| 913 |
+
- ✅ **Handwriting service failure now raises exception** (previously silent)
|
| 914 |
+
- ✅ **Seed parameter defaults to null** (previously 0)
|
| 915 |
+
- ✅ **Seed image download retry logic** (handles 503 timeout errors)
|
| 916 |
+
- ✅ **API docs show correct examples** (seed: null, not 0)
|
| 917 |
+
|
| 918 |
+
---
|
| 919 |
+
|
| 920 |
+
## Future Enhancements
|
| 921 |
+
|
| 922 |
+
### Short-term
|
| 923 |
+
1. **Configurable prompt templates** via API parameter
|
| 924 |
+
2. **Async endpoint progress tracking** (websocket or polling)
|
| 925 |
+
3. **Batch ZIP download** with multiple documents in one archive
|
| 926 |
+
4. **Cost estimation** before generation (preview mode)
|
| 927 |
+
|
| 928 |
+
### Long-term
|
| 929 |
+
1. **Custom visual element upload** (user-provided logos, signatures)
|
| 930 |
+
2. **Multi-page document support** (currently single-page only)
|
| 931 |
+
3. **Additional export formats** (COCO, YOLO, HuggingFace Datasets)
|
| 932 |
+
4. **Fine-tuning handwriting styles** (train on user's handwriting samples)
|
| 933 |
+
5. **LLM caching** (reduce cost for similar prompts)
|
| 934 |
+
|
| 935 |
+
---
|
| 936 |
+
|
| 937 |
+
## Troubleshooting
|
| 938 |
+
|
| 939 |
+
### Common Issues
|
| 940 |
+
|
| 941 |
+
**Q: "Handwriting service not called, but enable_handwriting=true"**
|
| 942 |
+
- Check: LLM output contains `class="handwritten"` in HTML
|
| 943 |
+
- Check: `handwriting_ratio` > 0 (default 0.2)
|
| 944 |
+
- Check: `HANDWRITING_SERVICE_ENABLED=true` in environment
|
| 945 |
+
- Debug: Look for "🔍 DEBUG - Handwriting Service Check" in logs
|
| 946 |
+
|
| 947 |
+
**Q: "RunPod job stuck IN_PROGRESS"**
|
| 948 |
+
- Cause: Large batch timing out
|
| 949 |
+
- Solution: Increase `HANDWRITING_SERVICE_TIMEOUT` (default 600s)
|
| 950 |
+
- Or: Reduce batch size by lowering `handwriting_ratio`
|
| 951 |
+
|
| 952 |
+
**Q: "503 first byte timeout" on seed download**
|
| 953 |
+
- Cause: CDN/storage provider temporary unavailability
|
| 954 |
+
- Solution: Retry logic automatically handles this (3 attempts)
|
| 955 |
+
- If persists: Use different image hosting (imgur, cloudinary)
|
| 956 |
+
|
| 957 |
+
**Q: "Seed parameter still shows 0 in API docs"**
|
| 958 |
+
- Fixed: Added `examples=[None, 42]` to Field definition
|
| 959 |
+
- Clear browser cache if seeing old docs
|
| 960 |
+
|
| 961 |
+
---
|
| 962 |
+
|
| 963 |
+
## Testing
|
| 964 |
+
|
| 965 |
+
### Unit Tests
|
| 966 |
+
```bash
|
| 967 |
+
# Test individual stages
|
| 968 |
+
pytest api/tests/test_utils.py::test_download_seed_images
|
| 969 |
+
pytest api/tests/test_utils.py::test_handwriting_service_batch
|
| 970 |
+
```
|
| 971 |
+
|
| 972 |
+
### Integration Tests
|
| 973 |
+
```bash
|
| 974 |
+
# Test sync endpoint (included in repo)
|
| 975 |
+
python api/test_sync_pdf_api.py
|
| 976 |
+
|
| 977 |
+
# Test async endpoint
|
| 978 |
+
python api/test_async_api.py
|
| 979 |
+
```
|
| 980 |
+
|
| 981 |
+
### Manual Testing via Docs UI
|
| 982 |
+
1. Navigate to `http://localhost:8000/docs`
|
| 983 |
+
2. Expand `/generate/pdf` endpoint
|
| 984 |
+
3. Click "Try it out"
|
| 985 |
+
4. Paste example request JSON
|
| 986 |
+
5. Click "Execute"
|
| 987 |
+
6. Download resulting ZIP file
|
| 988 |
+
|
| 989 |
+
### Example Test Request (Minimal)
|
| 990 |
+
```json
|
| 991 |
+
{
|
| 992 |
+
"seed_images": [
|
| 993 |
+
"https://i.imgur.com/example.jpg"
|
| 994 |
+
],
|
| 995 |
+
"prompt_params": {
|
| 996 |
+
"language": "english",
|
| 997 |
+
"doc_type": "invoice",
|
| 998 |
+
"num_solutions": 1,
|
| 999 |
+
"enable_handwriting": false,
|
| 1000 |
+
"enable_visual_elements": false,
|
| 1001 |
+
"enable_ocr": true,
|
| 1002 |
+
"enable_dataset_export": true
|
| 1003 |
+
}
|
| 1004 |
+
}
|
| 1005 |
+
```
|
| 1006 |
+
|
| 1007 |
+
---
|
| 1008 |
+
|
| 1009 |
+
## Conclusion
|
| 1010 |
+
|
| 1011 |
+
The DocGenie API successfully implements all 19 stages of the original batch pipeline in a request/response model suitable for real-time generation. Key architectural differences:
|
| 1012 |
+
|
| 1013 |
+
1. **Handwriting generation**: Offloaded to RunPod serverless (cost-efficient batching)
|
| 1014 |
+
2. **Seed selection**: User-provided URLs instead of pre-crawled dataset
|
| 1015 |
+
3. **State management**: Ephemeral in-memory processing vs file-based
|
| 1016 |
+
4. **Scalability**: Horizontal scaling via FastAPI workers + async processing
|
| 1017 |
+
|
| 1018 |
+
The API maintains feature parity with the batch pipeline while providing a simpler interface for integration with external systems (web apps, mobile apps, data pipelines).
|
| 1019 |
+
|
| 1020 |
+
**Total Processing Time**: 25-50s (no handwriting) or 200-230s (with handwriting)
|
| 1021 |
+
**Cost Per Document**: $0.015-0.08 depending on features
|
| 1022 |
+
**Output Formats**: PDF, PNG, msgpack, ZIP archive
|
| 1023 |
+
|
| 1024 |
+
For questions or issues, see `api/README.md` or `TESTING.md`.
|
ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🏗️ DocGenie Architecture & Dependency Resolution
|
| 2 |
+
|
| 3 |
+
## 📦 Package Structure
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
docgenie/ ← Root monorepo
|
| 7 |
+
├── docgenie/ ← Core package (importable)
|
| 8 |
+
│ ├── __init__.py
|
| 9 |
+
│ ├── generation/ ← Used by API
|
| 10 |
+
│ │ ├── pipeline_01/
|
| 11 |
+
│ │ │ └── claude_batching.py ← ClaudeBatchedClient
|
| 12 |
+
│ │ ├── pipeline_03/
|
| 13 |
+
│ │ ├── pipeline_04/
|
| 14 |
+
│ │ └── utils/
|
| 15 |
+
│ ├── evaluation/
|
| 16 |
+
│ └── utils/
|
| 17 |
+
│
|
| 18 |
+
├── api/ ← API Service (imports docgenie.*)
|
| 19 |
+
│ ├── main.py from docgenie import ENV
|
| 20 |
+
│ ├── worker.py from docgenie.generation.pipeline_01...
|
| 21 |
+
│ ├── utils.py from docgenie.generation...
|
| 22 |
+
│ └── requirements.txt Extra: Redis, Supabase, Google
|
| 23 |
+
│
|
| 24 |
+
├── handwriting_service/ ← GPU Service (NO docgenie imports!)
|
| 25 |
+
│ ├── main.py ✓ Self-contained
|
| 26 |
+
│ ├── inference.py ✓ No external deps
|
| 27 |
+
│ └── models.py
|
| 28 |
+
│
|
| 29 |
+
└── WordStylist/ ← Model code (used by handwriting)
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## 🔗 Dependency Graph
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 36 |
+
│ API Service │
|
| 37 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 38 |
+
│ │ api/main.py │ │
|
| 39 |
+
│ │ ↓ imports │ │
|
| 40 |
+
│ │ api/utils.py (call_claude_api_direct) │ │
|
| 41 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 42 |
+
│ │
|
| 43 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 44 |
+
│ │ api/worker.py │ │
|
| 45 |
+
│ │ ↓ imports │ │
|
| 46 |
+
│ │ from docgenie.generation.pipeline_01.claude_batching │ │
|
| 47 |
+
│ │ from docgenie.generation.constants │ │
|
| 48 |
+
│ │ from docgenie.generation.pipeline_03_process_response│ │
|
| 49 |
+
│ │ from docgenie.generation.pipeline_04_render_pdf... │ │
|
| 50 |
+
│ │ from docgenie import ENV │ │
|
| 51 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 52 |
+
│ ↓ │
|
| 53 |
+
│ REQUIRES │
|
| 54 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 55 |
+
│ │ docgenie/ package │ │
|
| 56 |
+
│ │ (entire generation module) │ │
|
| 57 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 58 |
+
└─────────────────────────────────────────────────────────────┘
|
| 59 |
+
|
| 60 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 61 |
+
│ Handwriting Service │
|
| 62 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 63 |
+
│ │ handwriting_service/main.py │ │
|
| 64 |
+
│ │ ↓ imports │ │
|
| 65 |
+
│ │ from handwriting_service.inference import ... │ │
|
| 66 |
+
│ │ from handwriting_service.models import ... │ │
|
| 67 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 68 |
+
│ ↓ │
|
| 69 |
+
│ REQUIRES │
|
| 70 |
+
│  │         WordStylist/ model                            │   │
|
| 71 |
+
│ │ WordStylist/ model │ │
|
| 72 |
+
│ │ (diffusion model code) │ │
|
| 73 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 74 |
+
│ │
|
| 75 |
+
│ ✓ NO docgenie imports - completely independent! │
|
| 76 |
+
└─────────────────────────────────────────────────────────────┘
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
## 🐳 Docker Build Strategy
|
| 80 |
+
|
| 81 |
+
### ❌ What Doesn't Work
|
| 82 |
+
|
| 83 |
+
```dockerfile
|
| 84 |
+
# ❌ WRONG: Can't copy just api/ folder
|
| 85 |
+
FROM python:3.11
|
| 86 |
+
COPY api/ /app/api/ # Missing docgenie package!
|
| 87 |
+
RUN pip install -r requirements.txt
|
| 88 |
+
CMD ["uvicorn", "main:app"] # ImportError: No module named 'docgenie'
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### ✅ What Works
|
| 92 |
+
|
| 93 |
+
```dockerfile
|
| 94 |
+
# ✅ CORRECT: Copy entire monorepo
|
| 95 |
+
FROM python:3.11
|
| 96 |
+
WORKDIR /app
|
| 97 |
+
|
| 98 |
+
# Copy everything
|
| 99 |
+
COPY . .
|
| 100 |
+
|
| 101 |
+
# Install docgenie as package
|
| 102 |
+
RUN pip install -e . # Makes docgenie.* importable
|
| 103 |
+
|
| 104 |
+
# Install API requirements
|
| 105 |
+
RUN pip install -r api/requirements.txt
|
| 106 |
+
|
| 107 |
+
WORKDIR /app/api
|
| 108 |
+
CMD ["uvicorn", "main:app"] # ✓ docgenie imports work!
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## 🚢 Deployment Strategy Comparison
|
| 112 |
+
|
| 113 |
+
### Option 1: Separate Deployments (❌ Won't Work)
|
| 114 |
+
|
| 115 |
+
```
|
| 116 |
+
API Deployment:
|
| 117 |
+
├── api/ folder only
|
| 118 |
+
└── ❌ Missing docgenie package → ImportError
|
| 119 |
+
|
| 120 |
+
Handwriting Deployment:
|
| 121 |
+
├── handwriting_service/ folder
|
| 122 |
+
└── WordStylist/
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**Problem:** API can't find docgenie imports!
|
| 126 |
+
|
| 127 |
+
### Option 2: Monorepo Deployment (✅ Works)
|
| 128 |
+
|
| 129 |
+
```
|
| 130 |
+
API Deployment:
|
| 131 |
+
├── docgenie/ package (core)
|
| 132 |
+
├── api/ service (imports docgenie)
|
| 133 |
+
├── setup.py
|
| 134 |
+
└── requirements.txt
|
| 135 |
+
|
| 136 |
+
Handwriting Deployment:
|
| 137 |
+
├── handwriting_service/
|
| 138 |
+
└── WordStylist/
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
**Solution:** Deploy entire repo for API, standalone for handwriting!
|
| 142 |
+
|
| 143 |
+
## 📁 File Structure in Containers
|
| 144 |
+
|
| 145 |
+
### API Container (Railway/EC2)
|
| 146 |
+
```
|
| 147 |
+
/app/
|
| 148 |
+
├── docgenie/ ← Installed as Python package
|
| 149 |
+
│ ├── __init__.py
|
| 150 |
+
│ ├── generation/
|
| 151 |
+
│ └── utils/
|
| 152 |
+
├── api/ ← Working directory
|
| 153 |
+
│ ├── main.py
|
| 154 |
+
│ ├── worker.py
|
| 155 |
+
│ └── utils.py
|
| 156 |
+
├── setup.py
|
| 157 |
+
└── pyproject.toml
|
| 158 |
+
|
| 159 |
+
Python can import:
|
| 160 |
+
✓ from docgenie.generation.pipeline_01 import ...
|
| 161 |
+
✓ from docgenie import ENV
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
### Handwriting Container (RunPod)
|
| 165 |
+
```
|
| 166 |
+
/app/
|
| 167 |
+
├── handwriting_service/
|
| 168 |
+
│ ├── main.py ← No docgenie imports!
|
| 169 |
+
│ ├── inference.py
|
| 170 |
+
│ └── models.py
|
| 171 |
+
└── WordStylist/ ← Model code
|
| 172 |
+
├── ldm/
|
| 173 |
+
└── wordstylist_inference.py
|
| 174 |
+
|
| 175 |
+
Python can import:
|
| 176 |
+
✓ from handwriting_service.inference import ...
|
| 177 |
+
✓ No docgenie dependencies needed!
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
## 🎯 Import Resolution Flow
|
| 181 |
+
|
| 182 |
+
### API Service Import Chain
|
| 183 |
+
|
| 184 |
+
1. **FastAPI starts:**
|
| 185 |
+
```python
|
| 186 |
+
uvicorn main:app
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
2. **main.py imports utils:**
|
| 190 |
+
```python
|
| 191 |
+
from api.utils import call_claude_api_direct
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
3. **utils.py imports docgenie:**
|
| 195 |
+
```python
|
| 196 |
+
from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
4. **Python looks for docgenie:**
|
| 200 |
+
- Checks sys.path
|
| 201 |
+
- Finds `/app` (where `pip install -e .` installed it)
|
| 202 |
+
- Loads `docgenie/__init__.py`
|
| 203 |
+
- ✓ Import succeeds!
|
| 204 |
+
|
| 205 |
+
### Handwriting Service Import Chain
|
| 206 |
+
|
| 207 |
+
1. **FastAPI starts:**
|
| 208 |
+
```python
|
| 209 |
+
uvicorn main:app
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
2. **main.py imports local modules:**
|
| 213 |
+
```python
|
| 214 |
+
from handwriting_service.inference import HandwritingGenerator
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
3. **inference.py imports WordStylist:**
|
| 218 |
+
```python
|
| 219 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "WordStylist"))
|
| 220 |
+
from ldm.models.diffusion.ddpm import LatentDiffusion
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
4. **Python loads local modules:**
|
| 224 |
+
- No external package dependencies
|
| 225 |
+
- ✓ Completely self-contained!
|
| 226 |
+
|
| 227 |
+
## 🔍 Verifying Imports
|
| 228 |
+
|
| 229 |
+
### Test API Imports
|
| 230 |
+
```bash
|
| 231 |
+
# Inside API container
|
| 232 |
+
python3 -c "from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient; print('✓ Import works!')"
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### Test Handwriting Imports
|
| 236 |
+
```bash
|
| 237 |
+
# Inside handwriting container
|
| 238 |
+
python3 -c "from handwriting_service.inference import HandwritingGenerator; print('✓ Import works!')"
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
## 💡 Key Insights
|
| 242 |
+
|
| 243 |
+
1. **API needs monorepo:** Must deploy entire `docgenie/` folder structure
|
| 244 |
+
2. **Handwriting is independent:** Can deploy just `handwriting_service/` + `WordStylist/`
|
| 245 |
+
3. **Docker layer caching:** Install docgenie package first, then API requirements
|
| 246 |
+
4. **Working directory matters:** Set WORKDIR to /app/api for API service
|
| 247 |
+
5. **Python package installation:** `pip install -e .` makes docgenie importable globally
|
| 248 |
+
|
| 249 |
+
## 📊 Deployment Size Comparison
|
| 250 |
+
|
| 251 |
+
| Deployment | Size | Contents |
|
| 252 |
+
|------------|------|----------|
|
| 253 |
+
| API (Railway) | ~2GB | Python 3.11 + docgenie + API deps + Playwright |
|
| 254 |
+
| Worker (Railway) | ~2GB | Same as API (shares image) |
|
| 255 |
+
| Handwriting (RunPod) | ~8GB | CUDA 11.8 + PyTorch + Diffusers + WordStylist |
|
| 256 |
+
|
| 257 |
+
**Total:** ~12GB (but cached independently)
|
| 258 |
+
|
| 259 |
+
## ✅ Checklist for Successful Deployment
|
| 260 |
+
|
| 261 |
+
- [ ] Dockerfile copies **entire monorepo** for API
|
| 262 |
+
- [ ] `pip install -e .` runs before API requirements
|
| 263 |
+
- [ ] WORKDIR set to /app/api for runtime
|
| 264 |
+
- [ ] Handwriting Dockerfile copies only handwriting_service/ + WordStylist/
|
| 265 |
+
- [ ] .dockerignore excludes data/ folders (too large)
|
| 266 |
+
- [ ] Environment variables set in Railway/EC2
|
| 267 |
+
- [ ] Redis URL points to Upstash
|
| 268 |
+
- [ ] HANDWRITING_SERVICE_URL points to RunPod endpoint
|
| 269 |
+
|
| 270 |
+
## 🎉 Result
|
| 271 |
+
|
| 272 |
+
```
|
| 273 |
+
✓ API can import from docgenie package
|
| 274 |
+
✓ Worker can use ClaudeBatchedClient
|
| 275 |
+
✓ Handwriting service runs independently
|
| 276 |
+
✓ All services communicate via HTTP
|
| 277 |
+
✓ No more ImportError!
|
| 278 |
+
```
|
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,875 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 DocGenie Deployment Guide
|
| 2 |
+
|
| 3 |
+
Complete guide for deploying DocGenie API + Handwriting Service to production with all interdependencies resolved.
|
| 4 |
+
|
| 5 |
+
## 📊 System Architecture
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 9 |
+
│ Client │
|
| 10 |
+
└────────────────────┬────────────────────────────────────────┘
|
| 11 |
+
│
|
| 12 |
+
▼
|
| 13 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 14 |
+
│ Railway (CPU) │
|
| 15 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 16 |
+
│ │ DocGenie API (Port 8000) │ │
|
| 17 |
+
│ │ - FastAPI server │ │
|
| 18 |
+
│ │ - Imports: docgenie.generation.* │ │
|
| 19 |
+
│ │ - Endpoints: /generate, /generate/pdf, /generate/async│ │
|
| 20 |
+
│ └──────────────┬───────────────────────────────────────┘ │
|
| 21 |
+
│ │ │
|
| 22 |
+
│ ┌──────────────▼───────────────────────────────────────┐ │
|
| 23 |
+
│ │ Background Worker │ │
|
| 24 |
+
│ │ - RQ worker (Redis Queue) │ │
|
| 25 |
+
│ │ - ClaudeBatchedClient (50% cost savings) │ │
|
| 26 |
+
│ │ - Imports: docgenie.generation.* │ │
|
| 27 |
+
│ └──────────────┬───────────────────────────────────────┘ │
|
| 28 |
+
└─────────────────┼────────────────────────────────────────────┘
|
| 29 |
+
│
|
| 30 |
+
┌─────────┴──────────┬──────────────┐
|
| 31 |
+
│ │ │
|
| 32 |
+
▼ ▼ ▼
|
| 33 |
+
┌───────────────┐ ┌──────────────────┐ ┌──────────────┐
|
| 34 |
+
│ Redis (Upstash)│ │ Supabase │ │ Google Drive │
|
| 35 |
+
│ - Job queue │ │ - PostgreSQL │ │ - File storage│
|
| 36 |
+
│ - Free tier │ │ - Document DB │ │ - OAuth 2.0 │
|
| 37 |
+
└───────────────┘ └──────────────────┘ └──────────────┘
|
| 38 |
+
│
|
| 39 |
+
▼
|
| 40 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 41 |
+
│ RunPod Serverless (GPU) │
|
| 42 |
+
│ ┌──────────────────────────────────────────────────────┐ │
|
| 43 |
+
│ │ Handwriting Service (Port 8080) │ │
|
| 44 |
+
│ │ - WordStylist diffusion model │ │
|
| 45 |
+
│ │ - PyTorch + CUDA 11.8 │ │
|
| 46 |
+
│ │ - NO docgenie imports (standalone) │ │
|
| 47 |
+
│ └──────────────────────────────────────────────────────┘ │
|
| 48 |
+
└─────────────────────────────────────────────────────────────┘
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
## 🔗 Dependency Resolution
|
| 52 |
+
|
| 53 |
+
### ✅ Problem: API imports from docgenie package
|
| 54 |
+
**Solution:** Deploy entire monorepo, install as package with `pip install -e .`
|
| 55 |
+
|
| 56 |
+
**API Service imports:**
|
| 57 |
+
```python
|
| 58 |
+
# api/worker.py
|
| 59 |
+
from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient
|
| 60 |
+
from docgenie import ENV
|
| 61 |
+
|
| 62 |
+
# api/utils.py
|
| 63 |
+
from docgenie.generation.constants import BS_PARSER, HANDWRITING_CLASS_NAME
|
| 64 |
+
from docgenie.generation.pipeline_01.claude_batching import create_message
|
| 65 |
+
from docgenie.generation.pipeline_03_process_response import process_response
|
| 66 |
+
from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import render_pdf
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
**Dockerfile solution:**
|
| 70 |
+
```dockerfile
|
| 71 |
+
# Copy entire monorepo
|
| 72 |
+
COPY . .
|
| 73 |
+
|
| 74 |
+
# Install as editable package
|
| 75 |
+
RUN pip install -e .
|
| 76 |
+
|
| 77 |
+
# Install API requirements
|
| 78 |
+
RUN pip install -r api/requirements.txt
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### ✅ Handwriting Service is Independent
|
| 82 |
+
**No docgenie imports!** Can be deployed standalone.
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
# handwriting_service/main.py - NO docgenie imports
|
| 86 |
+
from handwriting_service.inference import HandwritingGenerator
|
| 87 |
+
from handwriting_service.models import HandwritingRequest
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
## 📦 Pre-Deployment Checklist
|
| 91 |
+
|
| 92 |
+
### 1. Environment Variables
|
| 93 |
+
Create `api/.env` with all required variables:
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
# Claude API
|
| 97 |
+
ANTHROPIC_API_KEY=sk-ant-xxxxx
|
| 98 |
+
|
| 99 |
+
# Redis (will be replaced with Upstash URL)
|
| 100 |
+
REDIS_URL=redis://localhost:6379
|
| 101 |
+
|
| 102 |
+
# Handwriting Service
|
| 103 |
+
HANDWRITING_SERVICE_URL=http://localhost:8080
|
| 104 |
+
|
| 105 |
+
# Supabase
|
| 106 |
+
SUPABASE_URL=https://xxxxx.supabase.co
|
| 107 |
+
SUPABASE_KEY=eyJxxxxx
|
| 108 |
+
|
| 109 |
+
# Google Drive (for token refresh only)
|
| 110 |
+
# The frontend handles OAuth and sends tokens in API requests
|
| 111 |
+
# These credentials are only needed to refresh expired tokens during long jobs
|
| 112 |
+
GOOGLE_CLIENT_ID=xxxxx.apps.googleusercontent.com
|
| 113 |
+
GOOGLE_CLIENT_SECRET=GOCSPX-xxxxx
|
| 114 |
+
GOOGLE_DRIVE_FOLDER_NAME=DocGenie Documents
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
### 2. Test Locally First
|
| 118 |
+
```bash
|
| 119 |
+
# Terminal 1: Start Redis
|
| 120 |
+
docker run -p 6379:6379 redis:7-alpine
|
| 121 |
+
|
| 122 |
+
# Terminal 2: Start Handwriting Service
|
| 123 |
+
cd handwriting_service
|
| 124 |
+
DEVICE=cpu uvicorn main:app --port 8080
|
| 125 |
+
|
| 126 |
+
# Terminal 3: Start API
|
| 127 |
+
cd api
|
| 128 |
+
source ../.venv/bin/activate
|
| 129 |
+
uvicorn main:app --reload --port 8000
|
| 130 |
+
|
| 131 |
+
# Terminal 4: Start Worker
|
| 132 |
+
cd api
|
| 133 |
+
source ../.venv/bin/activate
|
| 134 |
+
python worker.py
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
Test endpoints:
|
| 138 |
+
```bash
|
| 139 |
+
# Health check
|
| 140 |
+
curl http://localhost:8000/health
|
| 141 |
+
|
| 142 |
+
# Async generation (uses batched API)
|
| 143 |
+
curl -X POST http://localhost:8000/generate/async \
|
| 144 |
+
-H "Content-Type: application/json" \
|
| 145 |
+
-d '{"template_name": "DocGenie", "num_pages": 2}'
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## 🚢 Deployment Steps
|
| 149 |
+
|
| 150 |
+
### Option A: Railway + RunPod (RECOMMENDED - $10/month)
|
| 151 |
+
|
| 152 |
+
#### Step 1: Deploy Redis to Upstash (FREE)
|
| 153 |
+
|
| 154 |
+
1. Go to https://upstash.com
|
| 155 |
+
2. Create account → New Redis Database
|
| 156 |
+
3. Copy the Redis connection URL (not the HTTPS `UPSTASH_REDIS_REST_URL`; it looks like: `redis://default:xxxxx@xxxxx.upstash.io:6379`)
|
| 157 |
+
|
| 158 |
+
#### Step 2: Deploy Handwriting Service to RunPod
|
| 159 |
+
|
| 160 |
+
**Option A: Build from Git Repository (RECOMMENDED - No Docker Hub needed!)**
|
| 161 |
+
|
| 162 |
+
This builds directly on RunPod's servers, avoiding the need to upload 10GB over your internet.
|
| 163 |
+
|
| 164 |
+
1. **Prepare and push code to Git:**
|
| 165 |
+
```bash
|
| 166 |
+
cd /media/ahad-hassan/Volume_E/FYP/FYP/docgenie
|
| 167 |
+
|
| 168 |
+
# First, prepare optimized WordStylist (removes 432MB of unnecessary files)
|
| 169 |
+
cd handwriting_service
|
| 170 |
+
./prepare_build.sh
|
| 171 |
+
cd ..
|
| 172 |
+
|
| 173 |
+
# Now commit the optimized WordStylist
|
| 174 |
+
git add handwriting_service/
|
| 175 |
+
git status # Verify WordStylist is included (should show WordStylist/models/ema_ckpt.pt, etc.)
|
| 176 |
+
git commit -m "Add handwriting service with optimized WordStylist"
|
| 177 |
+
git push origin main
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
2. **Deploy to RunPod:**
|
| 181 |
+
- Go to https://runpod.io → Serverless → New Endpoint
|
| 182 |
+
- Click "Build from Git" (not Docker Image)
|
| 183 |
+
- Settings:
|
| 184 |
+
- Name: `docgenie-handwriting`
|
| 185 |
+
- Git URL: `https://github.com/Ahadhassan-2003/FYP.git`
|
| 186 |
+
- Git Branch: `main`
|
| 187 |
+
- Docker Build Context: `docgenie/handwriting_service`
|
| 188 |
+
- Dockerfile Path: `Dockerfile`
|
| 189 |
+
- GPU: RTX 4090 or A40
|
| 190 |
+
- Container Disk: 15GB
|
| 191 |
+
- Max Workers: 1
|
| 192 |
+
- Idle Timeout: 5 seconds
|
| 193 |
+
- Exposed Port: 8080
|
| 194 |
+
- Environment Variables:
|
| 195 |
+
```
|
| 196 |
+
DEVICE=cuda
|
| 197 |
+
PYTHONUNBUFFERED=1
|
| 198 |
+
```
|
| 199 |
+
- Build Args (prepare WordStylist):
|
| 200 |
+
```
|
| 201 |
+
PREPARE_WORDSTYLIST=true
|
| 202 |
+
```
|
| 203 |
+
- Click "Deploy"
|
| 204 |
+
|
| 205 |
+
RunPod will clone your repo and build the image on their fast servers!
|
| 206 |
+
|
| 207 |
+
**Option B: Pre-built Docker Image (if Git unavailable)**
|
| 208 |
+
|
| 209 |
+
<details>
|
| 210 |
+
<summary>Click to expand Docker Hub method</summary>
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
cd handwriting_service
|
| 214 |
+
|
| 215 |
+
# Prepare optimized build (removes 432MB)
|
| 216 |
+
./prepare_build.sh
|
| 217 |
+
|
| 218 |
+
# Login to Docker Hub
|
| 219 |
+
docker login
|
| 220 |
+
|
| 221 |
+
# Build image
|
| 222 |
+
docker buildx build --platform linux/amd64 \
|
| 223 |
+
-t yourusername/docgenie-handwriting:latest \
|
| 224 |
+
--build-arg BUILDKIT_INLINE_CACHE=1 \
|
| 225 |
+
.
|
| 226 |
+
|
| 227 |
+
# Push to Docker Hub (may take 20-30 minutes for 10GB)
|
| 228 |
+
docker push yourusername/docgenie-handwriting:latest
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
Then deploy on RunPod:
|
| 232 |
+
- Go to https://runpod.io → Serverless → New Endpoint
|
| 233 |
+
- Docker Image: `yourusername/docgenie-handwriting:latest`
|
| 234 |
+
- GPU: RTX 4090 or A40
|
| 235 |
+
- Port: 8080
|
| 236 |
+
- Environment Variables: `DEVICE=cuda`
|
| 237 |
+
|
| 238 |
+
</details>
|
| 239 |
+
> Example push command: `docker push ahadhassan/docgenie-handwriting:v2`
|
| 240 |
+
3. **Get endpoint URL:**
|
| 241 |
+
- Copy the URL (looks like: `https://api.runpod.ai/v2/xxxxx/runsync`)
|
| 242 |
+
- This is your `HANDWRITING_SERVICE_URL`
|
| 243 |
+
|
| 244 |
+
#### Step 3: Deploy API to Railway
|
| 245 |
+
|
| 246 |
+
1. **Install Railway CLI:**
|
| 247 |
+
```bash
|
| 248 |
+
# Install Railway CLI
|
| 249 |
+
npm i -g @railway/cli
|
| 250 |
+
|
| 251 |
+
# Or use curl
|
| 252 |
+
bash <(curl -fsSL cli.new) railway
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
2. **Initialize Railway project:**
|
| 256 |
+
```bash
|
| 257 |
+
cd /media/ahad-hassan/Volume_E/FYP/FYP/docgenie
|
| 258 |
+
|
| 259 |
+
# Login to Railway
|
| 260 |
+
railway login
|
| 261 |
+
|
| 262 |
+
# Create new project
|
| 263 |
+
railway init
|
| 264 |
+
|
| 265 |
+
# Link to project (creates railway.json)
|
| 266 |
+
railway link
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
3. **Set environment variables:**
|
| 270 |
+
```bash
|
| 271 |
+
# Set all environment variables from api/.env
|
| 272 |
+
railway variables set ANTHROPIC_API_KEY=sk-ant-xxxxx
|
| 273 |
+
railway variables set REDIS_URL=redis://default:xxxxx@xxxxx.upstash.io:6379
|
| 274 |
+
railway variables set HANDWRITING_SERVICE_URL=https://api.runpod.ai/v2/xxxxx/runsync
|
| 275 |
+
railway variables set SUPABASE_URL=https://xxxxx.supabase.co
|
| 276 |
+
railway variables set SUPABASE_KEY=eyJxxxxx
|
| 277 |
+
|
| 278 |
+
# Google OAuth (for token refresh only - frontend provides tokens in requests)
|
| 279 |
+
railway variables set GOOGLE_CLIENT_ID=xxxxx.apps.googleusercontent.com
|
| 280 |
+
railway variables set GOOGLE_CLIENT_SECRET=GOCSPX-xxxxx
|
| 281 |
+
railway variables set GOOGLE_DRIVE_FOLDER_NAME="DocGenie Documents"
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
**Note:** Google access/refresh tokens are NOT environment variables! The frontend authenticates with Google OAuth, then passes `google_drive_token` and `google_drive_refresh_token` in the API request body. See [API request schema](api/schemas.py#L108-L114).
|
| 285 |
+
|
| 286 |
+
4. **Deploy API + Worker:**
|
| 287 |
+
```bash
|
| 288 |
+
# Railway will detect Dockerfile and deploy automatically
|
| 289 |
+
railway up
|
| 290 |
+
|
| 291 |
+
# Or connect to GitHub and deploy from there
|
| 292 |
+
railway connect
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
5. **Option 1: Separate Worker Service (For Production Scale):**
|
| 296 |
+
|
| 297 |
+
*Note: Only needed if processing 50+ concurrent jobs. For most use cases, Option 2 (combined) is sufficient.*
|
| 298 |
+
|
| 299 |
+
**Method A: Connect to Same GitHub Repo (Recommended)**
|
| 300 |
+
- Go to Railway dashboard → Your project → **New Service**
|
| 301 |
+
- Click **"GitHub Repo"** → Select your repo
|
| 302 |
+
- Name: `docgenie-worker`
|
| 303 |
+
- **Settings** → **Deploy**:
|
| 304 |
+
- Builder: `DOCKERFILE`
|
| 305 |
+
- Dockerfile Path: `Dockerfile`
|
| 306 |
+
- Root Directory: `/` (same as API)
|
| 307 |
+
- **Custom Start Command**:
|
| 308 |
+
```bash
|
| 309 |
+
rq worker --url $REDIS_URL
|
| 310 |
+
```
|
| 311 |
+
- **Variables**: Add all environment variables (same as API service)
|
| 312 |
+
- **Deploy**
|
| 313 |
+
|
| 314 |
+
**Method B: Use Same Docker Image as API**
|
| 315 |
+
- Railway dashboard → New Service → **Empty Service**
|
| 316 |
+
- Name: `docgenie-worker`
|
| 317 |
+
- **Settings** → **Source**: Link to API service's image
|
| 318 |
+
- **Custom Start Command**: `rq worker --url $REDIS_URL`
|
| 319 |
+
- **Variables**: Copy from API service
|
| 320 |
+
- **Deploy**
|
| 321 |
+
|
| 322 |
+
6. **Option 2: Combined API + Worker (Recommended for Getting Started):**
|
| 323 |
+
|
| 324 |
+
Update `railway.json` to run both in one service:
|
| 325 |
+
```json
|
| 326 |
+
{
|
| 327 |
+
"deploy": {
|
| 328 |
+
"startCommand": "uvicorn api.main:app --host 0.0.0.0 --port $PORT & rq worker --url $REDIS_URL & wait"
|
| 329 |
+
}
|
| 330 |
+
}
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
Then push:
|
| 334 |
+
```bash
|
| 335 |
+
git add railway.json
|
| 336 |
+
git commit -m "feat: Run API and worker in combined service"
|
| 337 |
+
git push
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
**Benefits:**
|
| 341 |
+
- ✅ Single service ($5/month instead of $10/month)
|
| 342 |
+
- ✅ Simpler logs and monitoring
|
| 343 |
+
- ✅ Automatic scaling together
|
| 344 |
+
- ✅ Good for 90% of use cases
|
| 345 |
+
|
| 346 |
+
7. **Get API URL:**
|
| 347 |
+
- Railway dashboard → API service → Settings → Domains
|
| 348 |
+
- Generate domain (e.g., `docgenie-api.up.railway.app`)
|
| 349 |
+
|
| 350 |
+
#### Step 4: Update Frontend
|
| 351 |
+
|
| 352 |
+
Update your frontend API URL to Railway domain:
|
| 353 |
+
```javascript
|
| 354 |
+
const API_URL = 'https://docgenie-api.up.railway.app';
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
### Option B: AWS EC2 + RunPod (For Production)
|
| 358 |
+
|
| 359 |
+
#### Prerequisites
|
| 360 |
+
- AWS account with EC2 access
|
| 361 |
+
- Domain name (optional, for SSL)
|
| 362 |
+
|
| 363 |
+
#### Step 1: Launch EC2 Instance
|
| 364 |
+
|
| 365 |
+
```bash
|
| 366 |
+
# Launch t3.medium instance
|
| 367 |
+
aws ec2 run-instances \
|
| 368 |
+
--image-id ami-0c55b159cbfafe1f0 \
|
| 369 |
+
--instance-type t3.medium \
|
| 370 |
+
--key-name your-key-pair \
|
| 371 |
+
--security-group-ids sg-xxxxx \
|
| 372 |
+
--subnet-id subnet-xxxxx
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
**Security Group Rules:**
|
| 376 |
+
- Port 22 (SSH) - Your IP only
|
| 377 |
+
- Port 80 (HTTP) - 0.0.0.0/0
|
| 378 |
+
- Port 443 (HTTPS) - 0.0.0.0/0
|
| 379 |
+
- Port 8000 (API) - 0.0.0.0/0
|
| 380 |
+
|
| 381 |
+
#### Step 2: Setup EC2
|
| 382 |
+
|
| 383 |
+
```bash
|
| 384 |
+
# SSH into instance
|
| 385 |
+
ssh -i your-key.pem ubuntu@your-ec2-ip
|
| 386 |
+
|
| 387 |
+
# Update system
|
| 388 |
+
sudo apt update && sudo apt upgrade -y
|
| 389 |
+
|
| 390 |
+
# Install Docker
|
| 391 |
+
curl -fsSL https://get.docker.com -o get-docker.sh
|
| 392 |
+
sudo sh get-docker.sh
|
| 393 |
+
sudo usermod -aG docker ubuntu
|
| 394 |
+
|
| 395 |
+
# Install Docker Compose
|
| 396 |
+
sudo apt install docker-compose-plugin -y
|
| 397 |
+
|
| 398 |
+
# Install Git
|
| 399 |
+
sudo apt install git -y
|
| 400 |
+
|
| 401 |
+
# Clone repository
|
| 402 |
+
git clone https://gitlab.cs.hs-rm.de/diss_lamott/docgenie.git
|
| 403 |
+
cd docgenie
|
| 404 |
+
```
|
| 405 |
+
|
| 406 |
+
#### Step 3: Configure Environment
|
| 407 |
+
|
| 408 |
+
```bash
|
| 409 |
+
# Create .env file
|
| 410 |
+
cd api
|
| 411 |
+
nano .env
|
| 412 |
+
|
| 413 |
+
# Paste all environment variables
|
| 414 |
+
# Save: Ctrl+X, Y, Enter
|
| 415 |
+
|
| 416 |
+
# Update REDIS_URL to use Upstash
|
| 417 |
+
# Update HANDWRITING_SERVICE_URL to RunPod endpoint
|
| 418 |
+
```
|
| 419 |
+
|
| 420 |
+
#### Step 4: Deploy with Docker Compose
|
| 421 |
+
|
| 422 |
+
```bash
|
| 423 |
+
cd /home/ubuntu/docgenie
|
| 424 |
+
|
| 425 |
+
# Start services (API + Worker + Redis)
|
| 426 |
+
docker-compose up -d api worker redis
|
| 427 |
+
|
| 428 |
+
# Check logs
|
| 429 |
+
docker-compose logs -f api
|
| 430 |
+
docker-compose logs -f worker
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
#### Step 5: Setup Nginx Reverse Proxy
|
| 434 |
+
|
| 435 |
+
```bash
|
| 436 |
+
# Install Nginx
|
| 437 |
+
sudo apt install nginx -y
|
| 438 |
+
|
| 439 |
+
# Create config
|
| 440 |
+
sudo nano /etc/nginx/sites-available/docgenie
|
| 441 |
+
|
| 442 |
+
# Paste configuration:
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
```nginx
|
| 446 |
+
server {
|
| 447 |
+
listen 80;
|
| 448 |
+
server_name your-domain.com; # Or use EC2 IP
|
| 449 |
+
|
| 450 |
+
location / {
|
| 451 |
+
proxy_pass http://localhost:8000;
|
| 452 |
+
proxy_http_version 1.1;
|
| 453 |
+
proxy_set_header Upgrade $http_upgrade;
|
| 454 |
+
proxy_set_header Connection 'upgrade';
|
| 455 |
+
proxy_set_header Host $host;
|
| 456 |
+
proxy_cache_bypass $http_upgrade;
|
| 457 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 458 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 459 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 460 |
+
|
| 461 |
+
# Increase timeout for long-running requests
|
| 462 |
+
proxy_read_timeout 300s;
|
| 463 |
+
proxy_connect_timeout 75s;
|
| 464 |
+
}
|
| 465 |
+
}
|
| 466 |
+
```
|
| 467 |
+
|
| 468 |
+
```bash
|
| 469 |
+
# Enable site
|
| 470 |
+
sudo ln -s /etc/nginx/sites-available/docgenie /etc/nginx/sites-enabled/
|
| 471 |
+
sudo nginx -t
|
| 472 |
+
sudo systemctl restart nginx
|
| 473 |
+
|
| 474 |
+
# Optional: Setup SSL with Let's Encrypt
|
| 475 |
+
sudo apt install certbot python3-certbot-nginx -y
|
| 476 |
+
sudo certbot --nginx -d your-domain.com
|
| 477 |
+
```
|
| 478 |
+
|
| 479 |
+
#### Step 6: Setup Systemd Service (Auto-restart)
|
| 480 |
+
|
| 481 |
+
```bash
|
| 482 |
+
# Create service file
|
| 483 |
+
sudo nano /etc/systemd/system/docgenie.service
|
| 484 |
+
```
|
| 485 |
+
|
| 486 |
+
```ini
|
| 487 |
+
[Unit]
|
| 488 |
+
Description=DocGenie API
|
| 489 |
+
After=docker.service
|
| 490 |
+
Requires=docker.service
|
| 491 |
+
|
| 492 |
+
[Service]
|
| 493 |
+
Type=oneshot
|
| 494 |
+
RemainAfterExit=yes
|
| 495 |
+
WorkingDirectory=/home/ubuntu/docgenie
|
| 496 |
+
ExecStart=/usr/bin/docker-compose up -d api worker redis
|
| 497 |
+
ExecStop=/usr/bin/docker-compose down
|
| 498 |
+
User=ubuntu
|
| 499 |
+
|
| 500 |
+
[Install]
|
| 501 |
+
WantedBy=multi-user.target
|
| 502 |
+
```
|
| 503 |
+
|
| 504 |
+
```bash
|
| 505 |
+
# Enable service
|
| 506 |
+
sudo systemctl daemon-reload
|
| 507 |
+
sudo systemctl enable docgenie
|
| 508 |
+
sudo systemctl start docgenie
|
| 509 |
+
|
| 510 |
+
# Check status
|
| 511 |
+
sudo systemctl status docgenie
|
| 512 |
+
```
|
| 513 |
+
|
| 514 |
+
## 🧪 Testing Production Deployment
|
| 515 |
+
|
| 516 |
+
### 1. Health Check
|
| 517 |
+
```bash
|
| 518 |
+
curl https://your-domain.com/health
|
| 519 |
+
```
|
| 520 |
+
|
| 521 |
+
### 2. Sync Generation (Fast)
|
| 522 |
+
```bash
|
| 523 |
+
curl -X POST https://your-domain.com/generate \
|
| 524 |
+
-H "Content-Type: application/json" \
|
| 525 |
+
-d '{
|
| 526 |
+
"template_name": "DocGenie",
|
| 527 |
+
"num_pages": 1
|
| 528 |
+
}'
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
### 3. Async Generation (Batched, Cheap)
|
| 532 |
+
```bash
|
| 533 |
+
# Start async job
|
| 534 |
+
RESPONSE=$(curl -X POST https://your-domain.com/generate/async \
|
| 535 |
+
-H "Content-Type: application/json" \
|
| 536 |
+
-d '{
|
| 537 |
+
"template_name": "DocGenie",
|
| 538 |
+
"num_pages": 2
|
| 539 |
+
}')
|
| 540 |
+
|
| 541 |
+
REQUEST_ID=$(echo $RESPONSE | jq -r '.request_id')
|
| 542 |
+
echo "Request ID: $REQUEST_ID"
|
| 543 |
+
|
| 544 |
+
# Poll status
|
| 545 |
+
while true; do
|
| 546 |
+
STATUS=$(curl -s https://your-domain.com/jobs/$REQUEST_ID/status | jq -r '.status')
|
| 547 |
+
echo "Status: $STATUS"
|
| 548 |
+
if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then
|
| 549 |
+
break
|
| 550 |
+
fi
|
| 551 |
+
sleep 10
|
| 552 |
+
done
|
| 553 |
+
|
| 554 |
+
# Get result
|
| 555 |
+
curl https://your-domain.com/jobs/$REQUEST_ID/status | jq
|
| 556 |
+
```
|
| 557 |
+
|
| 558 |
+
## 📊 Cost Breakdown
|
| 559 |
+
|
| 560 |
+
### Railway + RunPod (Recommended)
|
| 561 |
+
| Service | Cost | Notes |
|
| 562 |
+
|---------|------|-------|
|
| 563 |
+
| Railway (API + Worker) | $5-10/month | Includes 500 hours |
|
| 564 |
+
| Upstash Redis | FREE | 10K requests/day |
|
| 565 |
+
| RunPod Serverless GPU | $0.20/hr | Only charged when active |
|
| 566 |
+
| Supabase | FREE | 500MB database |
|
| 567 |
+
| **Total** | **~$10-15/month** | + $0.20/hr GPU usage |
|
| 568 |
+
|
| 569 |
+
### EC2 + RunPod
|
| 570 |
+
| Service | Cost | Notes |
|
| 571 |
+
|---------|------|-------|
|
| 572 |
+
| EC2 t3.medium | $30/month | 2 vCPU, 4GB RAM |
|
| 573 |
+
| Upstash Redis | FREE | External Redis |
|
| 574 |
+
| RunPod Serverless GPU | $0.20/hr | Only when needed |
|
| 575 |
+
| Supabase | FREE | External DB |
|
| 576 |
+
| **Total** | **~$30/month** | + $0.20/hr GPU usage |
|
| 577 |
+
|
| 578 |
+
### EC2 + Dedicated GPU (Production)
|
| 579 |
+
| Service | Cost | Notes |
|
| 580 |
+
|---------|------|-------|
|
| 581 |
+
| EC2 g4dn.xlarge | $150/month | 4 vCPU, 16GB RAM, T4 GPU |
|
| 582 |
+
| Supabase | FREE | External DB |
|
| 583 |
+
| **Total** | **~$150/month** | All-in-one solution |
|
| 584 |
+
|
| 585 |
+
## 🔧 Maintenance
|
| 586 |
+
|
| 587 |
+
### Update Deployment
|
| 588 |
+
|
| 589 |
+
**Railway:**
|
| 590 |
+
```bash
|
| 591 |
+
# Push to main branch (auto-deploy)
|
| 592 |
+
git push origin main
|
| 593 |
+
|
| 594 |
+
# Or manual deploy
|
| 595 |
+
railway up
|
| 596 |
+
```
|
| 597 |
+
|
| 598 |
+
**EC2:**
|
| 599 |
+
```bash
|
| 600 |
+
ssh ubuntu@your-ec2-ip
|
| 601 |
+
cd docgenie
|
| 602 |
+
git pull
|
| 603 |
+
docker-compose down
|
| 604 |
+
docker-compose up -d --build
|
| 605 |
+
```
|
| 606 |
+
|
| 607 |
+
### View Logs
|
| 608 |
+
|
| 609 |
+
**Railway:**
|
| 610 |
+
```bash
|
| 611 |
+
railway logs
|
| 612 |
+
```
|
| 613 |
+
|
| 614 |
+
**EC2:**
|
| 615 |
+
```bash
|
| 616 |
+
# API logs
|
| 617 |
+
docker-compose logs -f api
|
| 618 |
+
|
| 619 |
+
# Worker logs
|
| 620 |
+
docker-compose logs -f worker
|
| 621 |
+
|
| 622 |
+
# Nginx logs
|
| 623 |
+
sudo tail -f /var/log/nginx/access.log
|
| 624 |
+
sudo tail -f /var/log/nginx/error.log
|
| 625 |
+
```
|
| 626 |
+
|
| 627 |
+
### Monitor Redis Queue
|
| 628 |
+
|
| 629 |
+
```bash
|
| 630 |
+
# Connect to Redis
|
| 631 |
+
redis-cli -u $REDIS_URL
|
| 632 |
+
|
| 633 |
+
# Check queue status
|
| 634 |
+
> LLEN rq:queue:default
|
| 635 |
+
> LRANGE rq:queue:default 0 -1
|
| 636 |
+
```
|
| 637 |
+
|
| 638 |
+
## 🚨 Troubleshooting
|
| 639 |
+
|
| 640 |
+
### Issue: Worker can't import docgenie package
|
| 641 |
+
**Solution:** Dockerfile installs entire monorepo with `pip install -e .`
|
| 642 |
+
|
| 643 |
+
### Issue: Handwriting service connection timeout
|
| 644 |
+
**Solution:** Use RunPod's synchronous `/runsync` endpoint instead of the asynchronous `/run` endpoint
|
| 645 |
+
|
| 646 |
+
### Issue: Google token expired during job
|
| 647 |
+
**Solution:** Ensure `GOOGLE_REFRESH_TOKEN`, `GOOGLE_CLIENT_ID`, `GOOGLE_CLIENT_SECRET` are set
|
| 648 |
+
|
| 649 |
+
### Issue: Railway build fails (too large)
|
| 650 |
+
**Solution:** Check `.dockerignore` excludes `data/` folders
|
| 651 |
+
|
| 652 |
+
### Issue: Worker heartbeat timeout
|
| 653 |
+
**Solution:** The job is most likely still running; batched API jobs can take 10–30 minutes
|
| 654 |
+
|
| 655 |
+
## 📚 Next Steps
|
| 656 |
+
|
| 657 |
+
1. **Monitor costs:** Railway dashboard, RunPod usage page
|
| 658 |
+
2. **Setup alerts:** Railway → Settings → Notifications
|
| 659 |
+
3. **Scale workers:** Railway → Worker service → Settings → Replicas
|
| 660 |
+
4. **Add caching:** Redis cache for generated documents
|
| 661 |
+
5. **Setup CI/CD:** GitHub Actions → Railway auto-deploy
|
| 662 |
+
|
| 663 |
+
## 🎉 You're Done!
|
| 664 |
+
|
| 665 |
+
Your DocGenie API is now deployed with:
|
| 666 |
+
- ✅ All docgenie package imports resolved
|
| 667 |
+
- ✅ GPU handwriting service on RunPod
|
| 668 |
+
- ✅ Background workers for batched API
|
| 669 |
+
- ✅ Auto-scaling and cost optimization
|
| 670 |
+
- ✅ Google token refresh working
|
| 671 |
+
- ✅ Database schema compatibility
|
| 672 |
+
|
| 673 |
+
**API URL:** `https://your-domain.com`
|
| 674 |
+
**Docs:** `https://your-domain.com/docs`
|
| 675 |
+
**Health:** `https://your-domain.com/health`
|
| 676 |
+
|
| 677 |
+
---
|
| 678 |
+
|
| 679 |
+
## 🖥️ Local Testing Guide
|
| 680 |
+
|
| 681 |
+
### Architecture
|
| 682 |
+
|
| 683 |
+
```
|
| 684 |
+
┌─────────────────────────────────┐
|
| 685 |
+
│ DocGenie API (Port 8000) │──┐ HTTP
|
| 686 |
+
└─────────────────────────────────┘ │ localhost:8080
|
| 687 |
+
▼
|
| 688 |
+
┌─────────────────────────────────┐
|
| 689 |
+
│ Handwriting Service (Port 8080) │
|
| 690 |
+
│ - Loads WordStylist model │
|
| 691 |
+
└─────────────────────────────────┘
|
| 692 |
+
```
|
| 693 |
+
|
| 694 |
+
### Prerequisites
|
| 695 |
+
|
| 696 |
+
1. **Python environment**: `source .venv/bin/activate`
|
| 697 |
+
2. **WordStylist Model** at `WordStylist/models/ckpt.pt` and `ema_ckpt.pt`
|
| 698 |
+
3. **`api/.env`** with `ANTHROPIC_API_KEY`, `HANDWRITING_SERVICE_ENABLED=true`, `HANDWRITING_SERVICE_URL=http://localhost:8080`
|
| 699 |
+
|
| 700 |
+
### Step-by-Step Setup
|
| 701 |
+
|
| 702 |
+
**Terminal 1 – Handwriting Service:**
|
| 703 |
+
```bash
|
| 704 |
+
cd handwriting_service
|
| 705 |
+
DEVICE=cpu ./start.sh # CPU (no GPU required)
|
| 706 |
+
# DEVICE=cuda ./start.sh # GPU (faster)
|
| 707 |
+
```
|
| 708 |
+
|
| 709 |
+
**Terminal 2 – DocGenie API:**
|
| 710 |
+
```bash
|
| 711 |
+
cd api
|
| 712 |
+
uvicorn main:app --reload
|
| 713 |
+
```
|
| 714 |
+
|
| 715 |
+
**Terminal 3 – Test:**
|
| 716 |
+
```bash
|
| 717 |
+
curl http://localhost:8080/health # Handwriting service
|
| 718 |
+
curl http://localhost:8000/health # API
|
| 719 |
+
cd api && python test_api.py
|
| 720 |
+
```
|
| 721 |
+
|
| 722 |
+
### Performance Notes
|
| 723 |
+
- CPU mode: ~5–10 s/word | GPU mode: ~0.5–1 s/word
|
| 724 |
+
- Service processes all words in one batch for efficiency
|
| 725 |
+
|
| 726 |
+
---
|
| 727 |
+
|
| 728 |
+
## ⚙️ Railway-Specific Configuration
|
| 729 |
+
|
| 730 |
+
### Critical Issues & Fixes
|
| 731 |
+
|
| 732 |
+
**1. `.dockerignore` – Keep required data folders:**
|
| 733 |
+
```
|
| 734 |
+
!data/prompt_templates/
|
| 735 |
+
!data/visual_element_prefabs/
|
| 736 |
+
```
|
| 737 |
+
|
| 738 |
+
**2. `railway.json` – Start both API and worker:**
|
| 739 |
+
```json
|
| 740 |
+
"startCommand": "cd api && uvicorn main:app --host 0.0.0.0 --port $PORT & rq worker --url $REDIS_URL & wait"
|
| 741 |
+
```
|
| 742 |
+
|
| 743 |
+
### Environment Variables
|
| 744 |
+
|
| 745 |
+
#### 🔴 Required
|
| 746 |
+
```bash
|
| 747 |
+
ANTHROPIC_API_KEY=sk-ant-api03-xxx
|
| 748 |
+
REDIS_URL=rediss://default:xxx@xxx.upstash.io:6379
|
| 749 |
+
HANDWRITING_SERVICE_URL=https://api.runpod.ai/v2/ht9ajgrduitgpr/runsync
|
| 750 |
+
HANDWRITING_SERVICE_ENABLED=true
|
| 751 |
+
SUPABASE_URL=https://xxx.supabase.co
|
| 752 |
+
SUPABASE_KEY=xxx
|
| 753 |
+
GOOGLE_CLIENT_ID=xxx.apps.googleusercontent.com
|
| 754 |
+
GOOGLE_CLIENT_SECRET=xxx
|
| 755 |
+
```
|
| 756 |
+
|
| 757 |
+
#### 🟡 Recommended
|
| 758 |
+
```bash
|
| 759 |
+
RUNPOD_API_KEY=xxx
|
| 760 |
+
OCR_SERVICE_ENABLED=true
|
| 761 |
+
OCR_USE_LOCAL=true
|
| 762 |
+
OCR_ENGINE=microsoft_di
|
| 763 |
+
OCR_DPI=300
|
| 764 |
+
HANDWRITING_SERVICE_TIMEOUT=300
|
| 765 |
+
HANDWRITING_SERVICE_MAX_RETRIES=3
|
| 766 |
+
RQ_QUEUE_NAME=docgenie
|
| 767 |
+
LOG_LEVEL=INFO
|
| 768 |
+
```
|
| 769 |
+
|
| 770 |
+
#### 🟢 Optional (defaults are fine)
|
| 771 |
+
```bash
|
| 772 |
+
API_HOST=0.0.0.0
|
| 773 |
+
API_PORT=8000
|
| 774 |
+
DEBUG_MODE=false
|
| 775 |
+
CLAUDE_MODEL=claude-sonnet-4-5-20250929
|
| 776 |
+
CORS_ORIGINS=*
|
| 777 |
+
GOOGLE_DRIVE_FOLDER_NAME=DocGenie Documents
|
| 778 |
+
TEMP_DIR=/tmp/docgenie_api
|
| 779 |
+
HANDWRITING_APPLY_BLUR=false
|
| 780 |
+
BBOX_NORMALIZATION_ENABLED=false
|
| 781 |
+
GT_VERIFICATION_ENABLED=false
|
| 782 |
+
ANALYSIS_ENABLED=false
|
| 783 |
+
DEBUG_VISUALIZATION_ENABLED=false
|
| 784 |
+
```
|
| 785 |
+
|
| 786 |
+
### Validation Steps
|
| 787 |
+
|
| 788 |
+
```bash
|
| 789 |
+
# 1. Health check
|
| 790 |
+
curl https://your-app.up.railway.app/health
|
| 791 |
+
|
| 792 |
+
# 2. Sync generation
|
| 793 |
+
curl -X POST https://your-app.up.railway.app/api/generate \
|
| 794 |
+
-H "Content-Type: application/json" \
|
| 795 |
+
-d '{"document_category": "invoice", "pages": 1}'
|
| 796 |
+
|
| 797 |
+
# 3. Async generation
|
| 798 |
+
curl -X POST https://your-app.up.railway.app/api/async/generate \
|
| 799 |
+
-H "Content-Type: application/json" \
|
| 800 |
+
-d '{"document_category": "invoice", "pages": 1, "google_access_token": "ya29.xxx"}'
|
| 801 |
+
```
|
| 802 |
+
|
| 803 |
+
### Common Railway Issues
|
| 804 |
+
|
| 805 |
+
| Issue | Cause | Solution |
|
| 806 |
+
|-------|-------|----------|
|
| 807 |
+
| Worker not starting | Missing `rq worker` in start command | Check `railway.json` `startCommand` |
|
| 808 |
+
| Missing prompt templates | `.dockerignore` too aggressive | Add `!data/prompt_templates/` |
|
| 809 |
+
| Playwright errors | Browser not installed | Ensure `playwright install chromium` in Dockerfile |
|
| 810 |
+
| Redis connection errors | Wrong `REDIS_URL` | Verify in Railway env variables |
|
| 811 |
+
| Handwriting timeout | Batch too large | Increase `HANDWRITING_SERVICE_TIMEOUT` |
|
| 812 |
+
| Large Docker image | `data/` folders included | Check `.dockerignore` excludes datasets/embeddings |
|
| 813 |
+
|
| 814 |
+
---
|
| 815 |
+
|
| 816 |
+
## ⚡ RunPod Batch Optimization
|
| 817 |
+
|
| 818 |
+
### Problem (Old Parallel Processing)
|
| 819 |
+
Each text was sent as a separate RunPod request → N texts = N workers = N× activation cost.
|
| 820 |
+
|
| 821 |
+
**Example:** 10 texts → 10 workers × 18 s = 180 worker-seconds + 10× activation fees
|
| 822 |
+
|
| 823 |
+
### Solution (New Batch Processing)
|
| 824 |
+
All texts sent in **one** RunPod request → 1 worker handles everything.
|
| 825 |
+
|
| 826 |
+
**Example:** 10 texts → 1 worker × 190 s = 190 worker-seconds + 1× activation fee
|
| 827 |
+
**Savings: ~45–60% cost reduction** (activation fees dominate RunPod pricing)
|
| 828 |
+
|
| 829 |
+
### Batch Request Format (handler.py)
|
| 830 |
+
|
| 831 |
+
```json
|
| 832 |
+
{
|
| 833 |
+
"input": {
|
| 834 |
+
"texts": [
|
| 835 |
+
{"text": "Hello", "author_id": 42, "hw_id": "hw_0"},
|
| 836 |
+
{"text": "World", "author_id": 42, "hw_id": "hw_1"}
|
| 837 |
+
],
|
| 838 |
+
"apply_blur": true
|
| 839 |
+
}
|
| 840 |
+
}
|
| 841 |
+
```
|
| 842 |
+
|
| 843 |
+
**Response:**
|
| 844 |
+
```json
|
| 845 |
+
{
|
| 846 |
+
"status": "COMPLETED",
|
| 847 |
+
"output": {
|
| 848 |
+
"images": [
|
| 849 |
+
{"image_base64": "...", "width": 217, "height": 61, "text": "Hello", "author_id": 42, "hw_id": "hw_0"},
|
| 850 |
+
{"image_base64": "...", "width": 195, "height": 58, "text": "World", "author_id": 42, "hw_id": "hw_1"}
|
| 851 |
+
],
|
| 852 |
+
"total_generated": 2
|
| 853 |
+
}
|
| 854 |
+
}
|
| 855 |
+
```
|
| 856 |
+
|
| 857 |
+
> **Note:** Backward-compatible – single text requests (old format) are still supported. Handler auto-detects batch vs single based on the `"texts"` key.
|
| 858 |
+
|
| 859 |
+
### Timeout Configuration
|
| 860 |
+
Timeout is dynamically calculated: `num_texts × 20 + 30` seconds.
|
| 861 |
+
For large batches (20+ texts), set RunPod endpoint max execution time to 600 s.
|
| 862 |
+
|
| 863 |
+
### Cost Comparison
|
| 864 |
+
|
| 865 |
+
| Scenario | OLD (parallel) | NEW (batched) | Savings |
|
| 866 |
+
|----------|---------------|---------------|---------|
|
| 867 |
+
| 2 texts | 2 workers × 18 s | 1 worker × 38 s | ~50% |
|
| 868 |
+
| 10 texts | 10 workers × 18 s | 1 worker × 190 s | ~55% |
|
| 869 |
+
| 25 texts | 25 workers × 18 s | 1 worker × 480 s | ~60% |
|
| 870 |
+
|
| 871 |
+
### Integration Test
|
| 872 |
+
```bash
|
| 873 |
+
cd api
|
| 874 |
+
python test_runpod_integration.py
|
| 875 |
+
```
|
Dockerfile
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================================
|
| 2 |
+
# DocGenie API + Worker - Dockerfile (Minimal)
|
| 3 |
+
# ============================================
|
| 4 |
+
# Adapted for Hugging Face Spaces (Docker SDK):
|
| 5 |
+
# - Non-root user (UID 1000) — HF Spaces requirement
|
| 6 |
+
# - Port 7860 — HF Spaces default
|
| 7 |
+
# - Playwright browsers in user-owned path
|
| 8 |
+
|
| 9 |
+
FROM python:3.11-slim
|
| 10 |
+
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# Install runtime system dependencies
|
| 14 |
+
RUN apt-get update && apt-get install -y \
|
| 15 |
+
wget \
|
| 16 |
+
gnupg \
|
| 17 |
+
poppler-utils \
|
| 18 |
+
tesseract-ocr \
|
| 19 |
+
tesseract-ocr-eng \
|
| 20 |
+
libglib2.0-0 \
|
| 21 |
+
libnss3 \
|
| 22 |
+
libnspr4 \
|
| 23 |
+
libdbus-1-3 \
|
| 24 |
+
libatk1.0-0 \
|
| 25 |
+
libatk-bridge2.0-0 \
|
| 26 |
+
libcups2 \
|
| 27 |
+
libdrm2 \
|
| 28 |
+
libxkbcommon0 \
|
| 29 |
+
libxcomposite1 \
|
| 30 |
+
libxdamage1 \
|
| 31 |
+
libxfixes3 \
|
| 32 |
+
libxrandr2 \
|
| 33 |
+
libgbm1 \
|
| 34 |
+
libasound2 \
|
| 35 |
+
libpango-1.0-0 \
|
| 36 |
+
libcairo2 \
|
| 37 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 38 |
+
|
| 39 |
+
# Install pip packages (no uv needed - simpler)
|
| 40 |
+
COPY api/requirements.txt ./api/requirements.txt
|
| 41 |
+
RUN pip install --no-cache-dir -r api/requirements.txt
|
| 42 |
+
|
| 43 |
+
# Copy ONLY the docgenie modules needed by API (not the full package)
|
| 44 |
+
COPY docgenie/__init__.py ./docgenie/__init__.py
|
| 45 |
+
COPY docgenie/logging.py ./docgenie/logging.py
|
| 46 |
+
COPY docgenie/generation ./docgenie/generation
|
| 47 |
+
COPY data/prompt_templates ./data/prompt_templates
|
| 48 |
+
COPY data/visual_element_prefabs ./data/visual_element_prefabs
|
| 49 |
+
|
| 50 |
+
# Copy API code
|
| 51 |
+
COPY api ./api
|
| 52 |
+
|
| 53 |
+
# Copy startup script
|
| 54 |
+
COPY start.sh ./start.sh
|
| 55 |
+
RUN chmod +x start.sh
|
| 56 |
+
|
| 57 |
+
# Clean up Python cache
|
| 58 |
+
RUN find /usr/local/lib/python3.11/site-packages -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
|
| 59 |
+
find /usr/local/lib/python3.11/site-packages -name "*.pyc" -delete
|
| 60 |
+
|
| 61 |
+
# -------------------------------------------------------
|
| 62 |
+
# Non-root user setup — required by Hugging Face Spaces
|
| 63 |
+
# -------------------------------------------------------
|
| 64 |
+
RUN useradd -m -u 1000 user
|
| 65 |
+
|
| 66 |
+
# Install Playwright system dependencies as root (requires apt — must run before USER switch)
|
| 67 |
+
RUN playwright install-deps chromium
|
| 68 |
+
|
| 69 |
+
# Create writable directories and hand ownership to user
|
| 70 |
+
RUN mkdir -p /tmp/docgenie /home/user/.cache/playwright && \
|
| 71 |
+
chown -R user:user /app /tmp/docgenie /home/user
|
| 72 |
+
|
| 73 |
+
# Switch to non-root user for all runtime operations
|
| 74 |
+
USER user
|
| 75 |
+
|
| 76 |
+
# Set environment variables
|
| 77 |
+
ENV HOME=/home/user \
|
| 78 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 79 |
+
PYTHONUNBUFFERED=1 \
|
| 80 |
+
PYTHONPATH=/app \
|
| 81 |
+
PORT=7860 \
|
| 82 |
+
PLAYWRIGHT_BROWSERS_PATH=/home/user/.cache/playwright
|
| 83 |
+
|
| 84 |
+
# Download Playwright Chromium browser binary into user-owned cache directory
|
| 85 |
+
# (browser download only — system deps already installed above as root)
|
| 86 |
+
RUN playwright install chromium
|
| 87 |
+
|
| 88 |
+
# Expose port 7860 (Hugging Face Spaces default)
|
| 89 |
+
EXPOSE 7860
|
| 90 |
+
|
| 91 |
+
# Health check
|
| 92 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
| 93 |
+
CMD python -c "import requests; requests.get('http://localhost:7860/health')"
|
| 94 |
+
|
| 95 |
+
# Start command — shell script handles API + RQ worker
|
| 96 |
+
CMD ["./start.sh"]
|
GENERATION_PIPELINE_DOCUMENTATION.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LLM_PROJECT_CONTEXT_NOTE.md
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DocGenie Project Context Note (LLM Ready)
|
| 2 |
+
|
| 3 |
+
## 1) Executive Summary
|
| 4 |
+
DocGenie is an AI-driven synthetic document generation platform designed to create realistic, annotated datasets for document intelligence tasks.
|
| 5 |
+
|
| 6 |
+
The project combines:
|
| 7 |
+
- LLM-based document content and layout generation
|
| 8 |
+
- PDF rendering and geometric extraction
|
| 9 |
+
- Optional handwriting synthesis (diffusion model)
|
| 10 |
+
- Optional visual element insertion (logos, stamps, barcodes, charts, photos)
|
| 11 |
+
- OCR extraction and bbox normalization
|
| 12 |
+
- Ground-truth preparation for downstream machine learning
|
| 13 |
+
- API-first and async batch workflows for production-scale generation
|
| 14 |
+
|
| 15 |
+
The core idea is to transform a small set of real seed document images plus high-level generation parameters into large, diverse, reproducible synthetic datasets suitable for training and evaluation.
|
| 16 |
+
|
| 17 |
+
## 2) Problem Statement
|
| 18 |
+
Real document datasets are expensive and slow to collect, often constrained by privacy, class imbalance, and weak annotation quality. This limits model quality for tasks like DocVQA, KIE, and layout understanding.
|
| 19 |
+
|
| 20 |
+
Key challenges:
|
| 21 |
+
- Lack of large high-quality labeled datasets
|
| 22 |
+
- Domain mismatch between training and production documents
|
| 23 |
+
- Manual labeling cost and inconsistency
|
| 24 |
+
- Need for handwriting and visual artifacts in realistic layouts
|
| 25 |
+
- Need for reproducibility and controllable data generation
|
| 26 |
+
|
| 27 |
+
## 3) Proposed Solution
|
| 28 |
+
DocGenie proposes a modular synthetic dataset engine with controllable realism.
|
| 29 |
+
|
| 30 |
+
High-level solution flow:
|
| 31 |
+
1. Select and ingest seed images that represent target document style.
|
| 32 |
+
2. Use LLM prompting (vision + text) to generate HTML/CSS-based document variants and structured GT.
|
| 33 |
+
3. Render HTML to PDF and extract text geometry/bboxes.
|
| 34 |
+
4. Optionally replace selected text with generated handwriting.
|
| 35 |
+
5. Optionally insert visual elements (stamp/logo/barcode/photo/figure).
|
| 36 |
+
6. Produce final PDFs/images + OCR + normalized bboxes + verified GT + export packages.
|
| 37 |
+
|
| 38 |
+
Design principles:
|
| 39 |
+
- Stage-wise pipeline (clear inputs/outputs per stage)
|
| 40 |
+
- Reproducibility via seeds
|
| 41 |
+
- Production-ready API endpoints
|
| 42 |
+
- Async job orchestration for large runs
|
| 43 |
+
- Separation of CPU API workloads and GPU handwriting inference workloads
|
| 44 |
+
|
| 45 |
+
## 4) Project Goals
|
| 46 |
+
Primary goals:
|
| 47 |
+
- Generate realistic synthetic documents at scale
|
| 48 |
+
- Support multiple document AI tasks with rich annotation
|
| 49 |
+
- Provide configurable realism controls (handwriting ratio, visual element types, OCR toggles)
|
| 50 |
+
- Minimize generation cost with batched LLM calls
|
| 51 |
+
- Enable operational deployment with monitoring and async processing
|
| 52 |
+
|
| 53 |
+
Secondary goals:
|
| 54 |
+
- Improve dataset diversity through seed and prompt strategies
|
| 55 |
+
- Support rapid experimentation for model development
|
| 56 |
+
- Keep architecture modular for independent upgrades
|
| 57 |
+
|
| 58 |
+
## 5) Core Capabilities
|
| 59 |
+
- Seed-image-guided generation (1-8 images per request)
|
| 60 |
+
- Configurable document language/type and GT format
|
| 61 |
+
- Multi-output generation per seed set (num_solutions)
|
| 62 |
+
- Handwriting synthesis with writer-style consistency
|
| 63 |
+
- Visual element synthesis and insertion
|
| 64 |
+
- OCR extraction from final rendered artifacts
|
| 65 |
+
- Normalized bbox outputs for ML pipelines
|
| 66 |
+
- Optional dataset packaging/export (for training pipelines)
|
| 67 |
+
- Async batch generation with status polling and result retrieval
|
| 68 |
+
|
| 69 |
+
## 6) 19-Stage Pipeline (Conceptual)
|
| 70 |
+
DocGenie follows a full multi-stage pipeline:
|
| 71 |
+
1. Seed selection/download
|
| 72 |
+
2. Prompt LLM
|
| 73 |
+
3. Process LLM response and extract HTML/GT
|
| 74 |
+
4. Render PDF and extract geometries
|
| 75 |
+
5. Extract text bboxes
|
| 76 |
+
6. Validate generated artifacts
|
| 77 |
+
7. Extract handwriting region definitions
|
| 78 |
+
8. Extract visual element definitions
|
| 79 |
+
9. Generate handwriting images
|
| 80 |
+
10. Generate visual element images
|
| 81 |
+
11. Re-render PDF (without placeholders where required)
|
| 82 |
+
12. Insert handwriting overlays
|
| 83 |
+
13. Insert visual overlays
|
| 84 |
+
14. Render document images
|
| 85 |
+
15. Run OCR
|
| 86 |
+
16. Normalize bboxes
|
| 87 |
+
17. Prepare/verify GT
|
| 88 |
+
18. Analyze run statistics
|
| 89 |
+
19. Create debug/export outputs
|
| 90 |
+
|
| 91 |
+
Important detail:
|
| 92 |
+
- Browser geometries are typically expressed at 96 DPI while PDF geometry uses 72 DPI, requiring coordinate transforms between the two.
|
| 93 |
+
- Handwriting insertion requires text-to-bbox matching and deduplication logic.
|
| 94 |
+
|
| 95 |
+
## 7) API Product Surface
|
| 96 |
+
Main API behavior is centered around three use patterns:
|
| 97 |
+
|
| 98 |
+
1) Synchronous generation endpoint
|
| 99 |
+
- Returns generated documents and metadata directly in response.
|
| 100 |
+
- Suitable for development and debugging.
|
| 101 |
+
|
| 102 |
+
2) Synchronous PDF/ZIP artifact endpoint
|
| 103 |
+
- Returns packaged artifacts (PDF, metadata, optional assets) in downloadable form.
|
| 104 |
+
- Suitable for practical batch outputs.
|
| 105 |
+
|
| 106 |
+
3) Asynchronous batch endpoint
|
| 107 |
+
- Queues long-running generation jobs.
|
| 108 |
+
- Returns request/task id.
|
| 109 |
+
- Client polls status endpoint.
|
| 110 |
+
- Client fetches/downloads final output when completed.
|
| 111 |
+
- Best for production and larger workloads.
|
| 112 |
+
|
| 113 |
+
Typical request dimensions:
|
| 114 |
+
- seed_images: list of remote URLs
|
| 115 |
+
- prompt_params: language, doc type, GT settings, feature toggles, reproducibility seed
|
| 116 |
+
|
| 117 |
+
## 8) System Architecture
|
| 118 |
+
Monorepo-style architecture with independent service boundaries:
|
| 119 |
+
|
| 120 |
+
A) Core package
|
| 121 |
+
- Shared generation logic and pipeline stages.
|
| 122 |
+
|
| 123 |
+
B) API service (CPU)
|
| 124 |
+
- FastAPI interface
|
| 125 |
+
- Orchestrates generation pipeline
|
| 126 |
+
- Manages async queue and external integrations
|
| 127 |
+
|
| 128 |
+
C) Background worker (CPU)
|
| 129 |
+
- Executes queued async jobs
|
| 130 |
+
- Handles long-running generation and packaging workflows
|
| 131 |
+
|
| 132 |
+
D) Handwriting service (GPU)
|
| 133 |
+
- Separate service for diffusion-based handwriting generation
|
| 134 |
+
- Designed to be deployable independently
|
| 135 |
+
|
| 136 |
+
E) Data stores and platform services
|
| 137 |
+
- Queue broker (Redis)
|
| 138 |
+
- Metadata storage (database)
|
| 139 |
+
- File delivery/storage integration
|
| 140 |
+
|
| 141 |
+
Architecture intent:
|
| 142 |
+
- Keep API orchestration scalable and light
|
| 143 |
+
- Offload expensive handwriting generation to GPU service
|
| 144 |
+
- Enable independent deployment and scaling per component
|
| 145 |
+
|
| 146 |
+
## 9) Handwriting Subsystem
|
| 147 |
+
Handwriting generation is treated as a specialized capability:
|
| 148 |
+
- Uses diffusion-style generation with writer IDs/styles
|
| 149 |
+
- Supports per-word token generation and mapping
|
| 150 |
+
- Supports post-processing (blur, anti-aliasing, cropping)
|
| 151 |
+
- Designed for realism and style consistency within a document
|
| 152 |
+
|
| 153 |
+
Operational notes:
|
| 154 |
+
- Batch handling is optimized for service cost and startup overhead
|
| 155 |
+
- Some model/sampling settings are constrained by the underlying handwriting model implementation
|
| 156 |
+
|
| 157 |
+
## 10) Visual Element Subsystem
|
| 158 |
+
Visual elements include artifacts commonly found in real documents:
|
| 159 |
+
- logos
|
| 160 |
+
- stamps
|
| 161 |
+
- barcodes
|
| 162 |
+
- photos
|
| 163 |
+
- figures/charts
|
| 164 |
+
|
| 165 |
+
Key behavior:
|
| 166 |
+
- Placeholder-based extraction from generated HTML/geometries
|
| 167 |
+
- Type normalization and filtering by request settings
|
| 168 |
+
- Coordinate-aware insertion into final PDF/image artifacts
|
| 169 |
+
|
| 170 |
+
## 11) Data and Output Contracts
|
| 171 |
+
The project outputs ML-ready artifacts with rich metadata:
|
| 172 |
+
|
| 173 |
+
Typical outputs:
|
| 174 |
+
- Generated HTML/CSS
|
| 175 |
+
- Intermediate and final PDFs
|
| 176 |
+
- Rasterized page images
|
| 177 |
+
- Word/segment/layout bboxes
|
| 178 |
+
- Normalized coordinate variants
|
| 179 |
+
- Handwriting images and maps
|
| 180 |
+
- Visual element images and maps
|
| 181 |
+
- Ground-truth objects (task dependent)
|
| 182 |
+
- Optional packaged export for training pipelines
|
| 183 |
+
|
| 184 |
+
This enables direct use for training/evaluation datasets, debugging, and pipeline QA.
|
| 185 |
+
|
| 186 |
+
## 12) Deployment Strategy (Current Direction)
|
| 187 |
+
Recommended deployment split:
|
| 188 |
+
- API + worker on CPU-friendly platform
|
| 189 |
+
- Handwriting service on GPU-capable platform
|
| 190 |
+
- Redis and database as managed services
|
| 191 |
+
|
| 192 |
+
Why this split works:
|
| 193 |
+
- Different resource profiles (CPU orchestration vs GPU inference)
|
| 194 |
+
- Independent scaling and cost control
|
| 195 |
+
- Service isolation improves reliability and debugging
|
| 196 |
+
|
| 197 |
+
## 13) Testing and Quality Strategy
|
| 198 |
+
Project testing plan emphasizes:
|
| 199 |
+
- Unit tests per critical stage function
|
| 200 |
+
- Integration tests for service boundaries (LLM, handwriting service, queue)
|
| 201 |
+
- System tests for end-to-end generation
|
| 202 |
+
- Non-functional tests: performance, reliability, scalability, security
|
| 203 |
+
|
| 204 |
+
Key risk areas tested heavily:
|
| 205 |
+
- External API failures/retries
|
| 206 |
+
- Geometry and bbox alignment
|
| 207 |
+
- Async job state transitions
|
| 208 |
+
- Handwriting/visual overlay correctness
|
| 209 |
+
|
| 210 |
+
## 14) Known Constraints and Practical Considerations
|
| 211 |
+
- Quality depends on seed representativeness and prompt quality.
|
| 212 |
+
- External service availability (LLM providers, handwriting endpoint) impacts runtime reliability.
|
| 213 |
+
- Coordinate conversion and matching edge cases can affect overlay precision.
|
| 214 |
+
- Large batch jobs require async orchestration and observability.
|
| 215 |
+
- Some advanced generation realism features may still be iterative/improving.
|
| 216 |
+
|
| 217 |
+
## 15) Why This Project Matters
|
| 218 |
+
DocGenie addresses a real bottleneck in document AI: obtaining large, diverse, high-quality labeled training data.
|
| 219 |
+
|
| 220 |
+
It provides a controllable synthetic data engine that can:
|
| 221 |
+
- accelerate experimentation
|
| 222 |
+
- reduce dependence on private data access
|
| 223 |
+
- improve model robustness through diversity and controlled perturbations
|
| 224 |
+
- support multiple document AI tasks in one platform
|
| 225 |
+
|
| 226 |
+
## 16) Suggested Prompt Context for Future LLM Tasks
|
| 227 |
+
Use the following when asking an LLM to help with this codebase:
|
| 228 |
+
|
| 229 |
+
Project summary:
|
| 230 |
+
I am working on DocGenie, a synthetic document generation platform with a 19-stage pipeline. It uses LLM-generated HTML/CSS from seed images, renders to PDF, extracts bboxes/geometries, optionally inserts diffusion-generated handwriting and visual elements, runs OCR, normalizes bboxes, verifies GT, and exports ML-ready artifacts. The system has a FastAPI service, async worker, and separate GPU handwriting service.
|
| 231 |
+
|
| 232 |
+
Primary objective:
|
| 233 |
+
Improve reliability, generation quality, and production scalability of synthetic dataset generation for DocVQA/KIE/layout tasks.
|
| 234 |
+
|
| 235 |
+
Technical priorities:
|
| 236 |
+
- API and worker robustness
|
| 237 |
+
- bbox/geometry correctness
|
| 238 |
+
- handwriting and visual insertion accuracy
|
| 239 |
+
- async job reliability and observability
|
| 240 |
+
- deployment and cost optimization
|
| 241 |
+
|
| 242 |
+
Constraints:
|
| 243 |
+
- External dependencies (LLM APIs, managed queue/db, GPU service)
|
| 244 |
+
- need reproducibility through seeded runs
|
| 245 |
+
- preserve compatibility of output metadata for downstream ML pipelines
|
| 246 |
+
|
| 247 |
+
When proposing changes:
|
| 248 |
+
- Keep stage boundaries clear
|
| 249 |
+
- Avoid breaking output contracts
|
| 250 |
+
- Include failure handling and retries
|
| 251 |
+
- Prefer measurable improvements (latency, cost, quality, reliability)
|
| 252 |
+
|
| 253 |
+
## 17) Fast Context Snapshot (Short Version)
|
| 254 |
+
DocGenie is an API-first synthetic document dataset generator for document AI. It takes seed images and generation settings, uses an LLM to generate document HTML/GT, renders PDFs, extracts geometry, optionally adds handwriting and visual artifacts, runs OCR, normalizes annotations, and returns/exports ML-ready data. It is built as a modular 19-stage pipeline with async job processing and a separate GPU handwriting service for scalable production usage.
|
README.md
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: DocGenie API
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# DocGenie
|
| 12 |
+
|
| 13 |
+
## Project structure
|
| 14 |
+
The source code under /docgenie is split into three parts:
|
| 15 |
+
- **generation**: Code responsible for synthesizing datasets.
|
| 16 |
+
- **evaluation**: Code responsible for training models on original/synthetic data and evaluating them. Also contains code to load these datasets.
|
| 17 |
+
- **analyzation**: Code responsible for analyzing original/synthetic data, e.g. clustering, LayoutFID scores etc.
|
| 18 |
+
|
| 19 |
+
## Setting up project dependencies
|
| 20 |
+
Install uv astral (https://docs.astral.sh/uv/getting-started/installation/)
|
| 21 |
+
```
|
| 22 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
Install dependencies (set uv cache dir to appropriate dir in your data folder as default home cache dir has limited space):
|
| 26 |
+
```
|
| 27 |
+
uv sync --cache-dir /data/proj/$USER/.cache/uv/
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
Source the uv environment
|
| 31 |
+
```
|
| 32 |
+
source .venv/bin/activate
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
Or, directly run commands with uv run
|
| 36 |
+
```
|
| 37 |
+
uv run python /path/to/script
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Setting up dependencies for generation pipeline
|
| 41 |
+
Install playwright chromium by running
|
| 42 |
+
```
|
| 43 |
+
playwright install chromium
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
and also download chromium for PDF conversion:
|
| 47 |
+
```
|
| 48 |
+
wget -O chrome.zip "https://download-chromium.appspot.com/dl/Linux_x64?type=snapshots"
|
| 49 |
+
unzip chrome.zip
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
Add Chromium to your PATH
|
| 53 |
+
```
|
| 54 |
+
echo "export PATH=\"$(pwd)/chrome-linux:\$PATH\"" >> ~/.bashrc
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Reload your shell
|
| 58 |
+
```
|
| 59 |
+
source ~/.bashrc
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
Verify installation
|
| 63 |
+
```
|
| 64 |
+
chrome --version
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
# Synthetization Pipeline
|
| 68 |
+
- Set the env variable ANTHROPIC_API_KEY with your Anthropic API Key
|
| 69 |
+
- Create a new syn dataset definition file in data/syn_dataset_definitions. For a template refer to docvqa-test.yaml
|
| 70 |
+
- Execute 'docgenie/generation/main.py SynDsDefFname' where SynDsDefFname is the filename of the syn dataset definition without extension
|
| 71 |
+
- Data will be stored in 'data/datasets/SynDsName' where SynDsName is field 'name' in the syn dataset definition.
|
| 72 |
+
- Final PDFs will be stored in subdirectory pdf_final
|
| 73 |
+
- Handwriting synthesis is currently not implemented, so the final PDFs will be missing text. To see the PDF with the text which has to be replaced by handwriting see PDFs in sub directory pdf_pass1
|
| 74 |
+
- Visual element insertion is currently not implemented
|
| 75 |
+
|
| 76 |
+
# DocVQA Handwriting Generation
|
| 77 |
+
|
| 78 |
+
A toolkit for generating synthetic handwriting images for document visual question answering (DocVQA) tasks. This project provides scripts to generate, process, and enhance handwritten text overlays on documents using either font-based rendering or diffusion-based deep learning models.
|
| 79 |
+
|
| 80 |
+
## Overview
|
| 81 |
+
|
| 82 |
+
This repository contains tools to:
|
| 83 |
+
- Generate synthetic handwriting from bounding box specifications
|
| 84 |
+
- Apply post-processing effects (blur, antialiasing) for realistic rendering
|
| 85 |
+
- Support multiple generation backends (font-based, diffusion model)
|
| 86 |
+
- Handle word segmentation and concatenation for long words
|
| 87 |
+
- Maintain consistent author styles across documents
|
| 88 |
+
|
| 89 |
+
## Project Structure
|
| 90 |
+
|
| 91 |
+
```
|
| 92 |
+
docvqa_handwriting_generation/
|
| 93 |
+
├── model/ # Model architecture and training utilities
|
| 94 |
+
│ ├── text_encoder.py
|
| 95 |
+
│ ├── tokenizer.py
|
| 96 |
+
│ ├── train_hugging.py
|
| 97 |
+
│ └── experiments/
|
| 98 |
+
│ └── hf_conditional_latent/
|
| 99 |
+
│ ├── config.yaml
|
| 100 |
+
│ ├── writer_id_map.json
|
| 101 |
+
│ ├── checkpoints/
|
| 102 |
+
│ └── cached_vae/
|
| 103 |
+
├── scripts/ # Generation and evaluation scripts
|
| 104 |
+
│ ├── generate_handwriting_diffusion_raw.py
|
| 105 |
+
│ ├── generate_handwriting_resized.py
|
| 106 |
+
│ ├── generate_writer_style_eval.py
|
| 107 |
+
│ └── add_handwriting_blur.py
|
| 108 |
+
└── requirements.txt
|
| 109 |
+
```
|
| 110 |
+
## Directory Structure for Handwritten Text Images
|
| 111 |
+
|
| 112 |
+
```
|
| 113 |
+
data/
|
| 114 |
+
├── datasets/
|
| 115 |
+
│ ├── synthesized_datasets/
|
| 116 |
+
│ ├───── DocVQA-XYZ-Dataset/
|
| 117 |
+
│──────── handwriting_raw_tokens/ # Directory containing folders for each doc which in turn contains images
|
| 118 |
+
│ │────────────────7cd-ef-xy456-xxx-xxx_0/ # Directory for doc named as 7cd-ef-xy456-xxx-xxx_0 etc.
|
| 119 |
+
│ │──────────────────────── hw01_0.png # Images
|
| 120 |
+
│ │──────────────────────── hw01_1.png
|
| 121 |
+
│ │──────────────────────── .
|
| 122 |
+
│ │──────────────────────── .
|
| 123 |
+
│ │──────────────────────── .
|
| 124 |
+
│ │─────────────────32xc-ef-xy456-xxx-xxx_0/
|
| 125 |
+
│ │──────────────────────── hw01_0.png
|
| 126 |
+
│ │──────────────────────── hw01_1.png
|
| 127 |
+
│ │─────────��────────────── .
|
| 128 |
+
│ │──────────────────────── .
|
| 129 |
+
│ │──────────────────────── .
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
Dataset archives unpack directly into the repository root (e.g. `docvqa-handwritten-sizes4/`, `docvqa-test/`, `docvqa-viselems/`).
|
| 133 |
+
|
| 134 |
+
## Installation
|
| 135 |
+
|
| 136 |
+
### Requirements
|
| 137 |
+
|
| 138 |
+
- Python 3.8+
|
| 139 |
+
- PyTorch (for diffusion backend)
|
| 140 |
+
- Other dependencies listed in `requirements.txt`
|
| 141 |
+
|
| 142 |
+
### Setup
|
| 143 |
+
|
| 144 |
+
1. Clone the repository:
|
| 145 |
+
```bash
|
| 146 |
+
git clone <repository-url>
|
| 147 |
+
cd docvqa_handwriting_generation
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
2. Install dependencies:
|
| 151 |
+
TODO: update pyproject.toml for dependencies, we now use UV
|
| 152 |
+
```bash
|
| 153 |
+
pip install -r requirements.txt
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
3. Download or train the diffusion model:
|
| 157 |
+
|
| 158 |
+
**Pre-trained Models:** `https://drive.google.com/drive/folders/1ujMRnW3avELk-oEhlrVeQ2oTd2j7nM77?usp=sharing`
|
| 159 |
+
|
| 160 |
+
Expected structure after extraction:
|
| 161 |
+
```
|
| 162 |
+
model/
|
| 163 |
+
└── experiments/
|
| 164 |
+
└── hf_conditional_latent/
|
| 165 |
+
├── config.yaml # Model configuration
|
| 166 |
+
├── writer_id_map.json # Writer ID to index mapping
|
| 167 |
+
├── cached_vae/ # VAE decoder (auto-downloaded on first use)
|
| 168 |
+
│ ├── config.json
|
| 169 |
+
│ └── diffusion_pytorch_model.safetensors
|
| 170 |
+
└── checkpoints/
|
| 171 |
+
├── latest.pt # Latest checkpoint
|
| 172 |
+
└── checkpoint-####.pt # Epoch checkpoints
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
**Note:** The VAE decoder will be automatically downloaded from HuggingFace on first use and cached locally.
|
| 176 |
+
|
| 177 |
+
4. Download datasets (optional, for testing):
|
| 178 |
+
|
| 179 |
+
**DocVQA Handwritten Dataset:** `https://drive.google.com/drive/folders/1ujMRnW3avELk-oEhlrVeQ2oTd2j7nM77?usp=sharing`
|
| 180 |
+
|
| 181 |
+
## Usage
|
| 182 |
+
|
| 183 |
+
### 1. Diffusion-Based Handwriting Generation
|
| 184 |
+
|
| 185 |
+
Generate handwriting tokens using a conditional diffusion model with writer style control and intelligent word splitting:
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
python scripts/generate_handwriting_diffusion_raw.py \
|
| 189 |
+
--input-dir data/docvqa-handwritten-sizes4/handwriting_bbox \
|
| 190 |
+
--output-dir output/handwriting_raw_tokens \
|
| 191 |
+
--run-dir model/experiments/hf_conditional_latent \
|
| 192 |
+
--checkpoint latest.pt \
|
| 193 |
+
--steps 30 \
|
| 194 |
+
--split-length 7 \
|
| 195 |
+
--batch-size 8 \
|
| 196 |
+
--temperature 1.0 \
|
| 197 |
+
--device cuda
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
**Key Features:**
|
| 201 |
+
|
| 202 |
+
**Intelligent Word Splitting:**
|
| 203 |
+
- Words longer than `--split-length` are automatically split into segments
|
| 204 |
+
- Example: `--split-length 7` → "generation" becomes "generat" + "ion"
|
| 205 |
+
- Segments are generated separately and stitched horizontally
|
| 206 |
+
- Set `--split-length 0` to disable splitting
|
| 207 |
+
|
| 208 |
+
**Writer Style Control:**
|
| 209 |
+
- Each author gets a consistent style ID per document
|
| 210 |
+
- Style IDs are derived from the model's trained writer embeddings
|
| 211 |
+
- Maintains style consistency across all words from the same author
|
| 212 |
+
|
| 213 |
+
**Conditional Diffusion:**
|
| 214 |
+
- Uses HuggingFace UNet2DConditionModel with cross-attention
|
| 215 |
+
- Character-level text encoding via transformer
|
| 216 |
+
- VAE latent space generation (auto-downloads stabilityai/sd-vae-ft-mse)
|
| 217 |
+
- Configurable sampling temperature for quality/diversity tradeoff
|
| 218 |
+
|
| 219 |
+
**Arguments:**
|
| 220 |
+
- `--run-dir`: Path to model experiment directory
|
| 221 |
+
- `--checkpoint`: Checkpoint filename (default: `latest.pt`)
|
| 222 |
+
- `--steps`: Number of diffusion steps (default: 30; more = better quality)
|
| 223 |
+
- `--split-length`: Max word length before splitting (default: 7)
|
| 224 |
+
- `--temperature`: Sampling temperature (0.7-0.9 = conservative, 1.0 = standard, 1.1-1.3 = creative)
|
| 225 |
+
- `--batch-size`: Batch size for GPU efficiency (default: 8)
|
| 226 |
+
- `--use-ema`: Use EMA weights if available in checkpoint
|
| 227 |
+
|
| 228 |
+
**Output:**
|
| 229 |
+
- Images: `<output-dir>/<json_stem>/hw<id>_<word_no>.png`
|
| 230 |
+
- Mapping: `<output-dir>/raw_token_map.json`
|
| 231 |
+
|
| 232 |
+
**Output Features:**
|
| 233 |
+
- RGBA format with transparent backgrounds
|
| 234 |
+
- Tight cropping to handwriting content
|
| 235 |
+
- Word segments automatically stitched horizontally
|
| 236 |
+
- Baseline-aligned concatenation for natural appearance
|
| 237 |
+
|
| 238 |
+
### 2. Resized Handwriting Generation
|
| 239 |
+
|
| 240 |
+
Generate handwriting scaled to fit specific bounding boxes:
|
| 241 |
+
|
| 242 |
+
```bash
|
| 243 |
+
python scripts/generate_handwriting_resized.py \
|
| 244 |
+
--input-dir data/syn_docvqa/handwriting_bbox \
|
| 245 |
+
--output-dir output/handwriting_rendered \
|
| 246 |
+
--backend font \
|
| 247 |
+
--fonts-dir assets/fonts \
|
| 248 |
+
--max-workers 8
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
**Backends:**
|
| 252 |
+
- `font`: Pillow-based pseudo-handwriting (fast, no GPU needed)
|
| 253 |
+
- `diffusion`: Deep learning model (requires GPU, model artifacts)
|
| 254 |
+
|
| 255 |
+
**Output:**
|
| 256 |
+
- Images: `<output-dir>/<json_stem>__<hw_id>__seg<index>.png`
|
| 257 |
+
- Mapping: `<output-dir>/handwriting_image_map.json`
|
| 258 |
+
|
| 259 |
+
### 3. Post-Processing with Blur
|
| 260 |
+
|
| 261 |
+
Add realistic blur and anti-aliasing to generated handwriting:
|
| 262 |
+
|
| 263 |
+
```bash
|
| 264 |
+
python scripts/add_handwriting_blur.py \
|
| 265 |
+
--input-root output/handwriting_raw_tokens \
|
| 266 |
+
--output-root output/handwriting_raw_tokens_blur \
|
| 267 |
+
--mapping-json output/handwriting_raw_tokens/raw_token_map.json \
|
| 268 |
+
--append-mapping \
|
| 269 |
+
--radius-min 0.6 \
|
| 270 |
+
--radius-max 1.8 \
|
| 271 |
+
--antialias
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
**Features:**
|
| 275 |
+
- Gaussian blur with configurable radius
|
| 276 |
+
- Optional downscale+upscale anti-aliasing
|
| 277 |
+
- Advanced edge refinement (erosion, dilation, unsharp mask)
|
| 278 |
+
- Updates mapping JSON with blurred image paths
|
| 279 |
+
- Supports in-place or mirror directory output
|
| 280 |
+
|
| 281 |
+
### 4. Writer Style Evaluation Exports
|
| 282 |
+
|
| 283 |
+
Generate per-writer evaluation samples with a curated word list and DPM-Solver++ sampling:
|
| 284 |
+
|
| 285 |
+
```bash
|
| 286 |
+
python scripts/generate_writer_style_eval.py \
|
| 287 |
+
--run-dir model/experiments/hf_conditional_latent \
|
| 288 |
+
--checkpoint latest.pt \
|
| 289 |
+
--output-dir writer_eval \
|
| 290 |
+
--max-words 48 \
|
| 291 |
+
--batch-size 12 \
|
| 292 |
+
--num-steps 30 \
|
| 293 |
+
--temperature 0.7 \
|
| 294 |
+
--device cuda
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
**Outputs:**
|
| 298 |
+
- PNG samples saved under `<output-dir>/writer_XXXX/`
|
| 299 |
+
- `<output-dir>/writer_style_manifest.json` summarizing words, writers, and generation metadata
|
| 300 |
+
|
| 301 |
+
## Input Format
|
| 302 |
+
|
| 303 |
+
### Handwriting Bbox JSON
|
| 304 |
+
|
| 305 |
+
Input JSON files specify bounding boxes and text for handwriting generation:
|
| 306 |
+
|
| 307 |
+
```json
|
| 308 |
+
[
|
| 309 |
+
{
|
| 310 |
+
"id": "hw0",
|
| 311 |
+
"text": "Example Text",
|
| 312 |
+
"author-id": "author1",
|
| 313 |
+
"bboxes": [
|
| 314 |
+
"110.69,124.79,161.76,143.41,Example,22,0,0",
|
| 315 |
+
"166.85,124.79,204.83,143.41,Text,22,0,1"
|
| 316 |
+
]
|
| 317 |
+
}
|
| 318 |
+
]
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
**Bbox format:** `x1,y1,x2,y2,text,block_no,line_no,word_no`
|
| 322 |
+
- Coordinates are floats
|
| 323 |
+
- Last 3 values are indices for grouping (block, line, word)
|
| 324 |
+
- Text can contain any characters (including commas)
|
| 325 |
+
|
| 326 |
+
## Key Features
|
| 327 |
+
|
| 328 |
+
### Intelligent Word Splitting
|
| 329 |
+
- Automatically splits words exceeding `--split-length` characters
|
| 330 |
+
- Example: "generation" (10 chars) → "generat" + "ion" (with split_length=7)
|
| 331 |
+
- Segments generated independently with same style
|
| 332 |
+
- Stitched horizontally with baseline alignment
|
| 333 |
+
- Configurable via `--split-length` parameter (0 = no splitting)
|
| 334 |
+
|
| 335 |
+
### Writer Style Consistency
|
| 336 |
+
- Each author ID gets consistent style per document
|
| 337 |
+
- Style derived from trained writer embeddings in model
|
| 338 |
+
- Falls back to deterministic hashing for unknown authors
|
| 339 |
+
- Reproducible with same `--seed` value
|
| 340 |
+
|
| 341 |
+
### Conditional Text Generation
|
| 342 |
+
- Character-level transformer text encoder
|
| 343 |
+
- Cross-attention conditioning in UNet
|
| 344 |
+
- VAE latent space generation (64×256 latent → decoded to full resolution)
|
| 345 |
+
- Temperature control for quality/diversity tradeoff
|
| 346 |
+
|
| 347 |
+
### Batched GPU Generation
|
| 348 |
+
- Process multiple segments in parallel
|
| 349 |
+
- Configurable batch size for memory optimization
|
| 350 |
+
- Progress tracking with tqdm
|
| 351 |
+
|
| 352 |
+
### Output Quality
|
| 353 |
+
- RGBA format with transparent backgrounds
|
| 354 |
+
- Tight cropping to ink extents
|
| 355 |
+
- Otsu thresholding for clean binarization
|
| 356 |
+
- Baseline-aligned word segment stitching
|
| 357 |
+
- Version-controlled output mappings
|
| 358 |
+
|
| 359 |
+
## Advanced Options
|
| 360 |
+
|
| 361 |
+
### Diffusion Generation Parameters
|
| 362 |
+
- `--steps`: Number of diffusion steps (default: 30; more = higher quality, slower)
|
| 363 |
+
- Quick preview: 15-20 steps
|
| 364 |
+
- Production: 30-50 steps
|
| 365 |
+
- `--split-length`: Maximum word length before splitting (default: 7; 0 = no splitting)
|
| 366 |
+
- `--temperature`: Sampling temperature (default: 1.0)
|
| 367 |
+
- 0.7-0.9: Conservative, cleaner output
|
| 368 |
+
- 1.0: Standard sampling
|
| 369 |
+
- 1.1-1.3: Creative, more diverse
|
| 370 |
+
- `--batch-size`: Batch size for GPU processing (default: 8)
|
| 371 |
+
- `--seed`: Random seed for reproducibility (default: 42)
|
| 372 |
+
- `--use-ema`: Use EMA weights if available (improves quality)
|
| 373 |
+
|
| 374 |
+
### Blur Parameters
|
| 375 |
+
- `--radius`: Fixed blur radius (overrides min/max)
|
| 376 |
+
- `--radius-min/max`: Random uniform blur range
|
| 377 |
+
- `--antialias`: Enable downscale+upscale smoothing
|
| 378 |
+
- `--scale-factor`: Downscale factor for antialiasing (default: 0.75)
|
| 379 |
+
|
| 380 |
+
## Troubleshooting
|
| 381 |
+
|
| 382 |
+
### CUDA Out of Memory
|
| 383 |
+
- Reduce `--batch-size` to 1-4
|
| 384 |
+
- Reduce `--steps` (try 20-30)
|
| 385 |
+
- Use CPU: `--device cpu` (much slower)
|
| 386 |
+
- Close other GPU applications
|
| 387 |
+
|
| 388 |
+
### Missing Model Files
|
| 389 |
+
Ensure you have the trained model checkpoint in:
|
| 390 |
+
```
|
| 391 |
+
model/experiments/hf_conditional_latent/
|
| 392 |
+
├── config.yaml
|
| 393 |
+
├── writer_id_map.json
|
| 394 |
+
└── checkpoints/
|
| 395 |
+
└── latest.pt
|
| 396 |
+
```
|
| 397 |
+
|
| 398 |
+
The VAE decoder will be auto-downloaded on first use to:
|
| 399 |
+
```
|
| 400 |
+
model/experiments/hf_conditional_latent/cached_vae/
|
| 401 |
+
```
|
| 402 |
+
|
| 403 |
+
### Import Errors
|
| 404 |
+
Make sure all dependencies are installed:
|
| 405 |
+
```bash
|
| 406 |
+
pip install -r requirements.txt
|
| 407 |
+
```
|
| 408 |
+
|
| 409 |
+
Ensure model components are accessible:
|
| 410 |
+
```bash
|
| 411 |
+
# From project root
|
| 412 |
+
python -c "from model.text_encoder import TextEncoder; from model.tokenizer import CharTokenizer"
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
### Style Not Working
|
| 416 |
+
Check that `writer_id_map.json` exists in your run directory and contains the author IDs from your dataset.
|
| 417 |
+
|
| 418 |
+
## Model Architecture
|
| 419 |
+
|
| 420 |
+
### Components
|
| 421 |
+
- **Text Encoder**: Character-level transformer (256-dim, 6 layers, 8 heads)
|
| 422 |
+
- **UNet**: HuggingFace UNet2DConditionModel with cross-attention
|
| 423 |
+
- **VAE**: Stable Diffusion VAE (stabilityai/sd-vae-ft-mse)
|
| 424 |
+
- **Tokenizer**: Character-level with special tokens (PAD, UNK, SOS, EOS)
|
| 425 |
+
|
| 426 |
+
### Training
|
| 427 |
+
Refer to `model/train_hugging.py` and `training/config_latent.yaml` for training configuration.
|
| 428 |
+
|
| 429 |
+
## Downloads
|
| 430 |
+
|
| 431 |
+
### Pre-trained Model
|
| 432 |
+
**Required for diffusion-based generation**
|
| 433 |
+
- Download Link: `https://drive.google.com/drive/folders/1ujMRnW3avELk-oEhlrVeQ2oTd2j7nM77?usp=sharing`
|
| 434 |
+
- Extract to: `model/experiments/`
|
| 435 |
+
- Required files:
|
| 436 |
+
- `config.yaml` - Model configuration
|
| 437 |
+
- `writer_id_map.json` - Writer style mappings
|
| 438 |
+
- `checkpoints/latest.pt` - Model weights
|
| 439 |
+
|
| 440 |
+
### Datasets
|
| 441 |
+
**Optional - for testing and examples**
|
| 442 |
+
- DocVQA Handwritten Dataset: `https://drive.google.com/drive/folders/1ujMRnW3avELk-oEhlrVeQ2oTd2j7nM77?usp=sharing`
|
| 443 |
+
- Extract to: `data/`
|
| 444 |
+
|
| 445 |
+
## Citation
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
## License
|
| 449 |
+
|
| 450 |
+
[Specify your license here]
|
| 451 |
+
|
| 452 |
+
## Contributing
|
| 453 |
+
|
| 454 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
TESTING_PLAN.md
ADDED
|
@@ -0,0 +1,1161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Comprehensive Testing Plan & Test Cases
|
| 2 |
+
## DocGenie Synthetic Document Generation API
|
| 3 |
+
|
| 4 |
+
**Document Version**: 1.0
|
| 5 |
+
**Date**: March 4, 2026
|
| 6 |
+
**Project**: DocGenie - AI-Powered Synthetic Document Dataset Generator
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## Table of Contents
|
| 11 |
+
1. [Testing Overview](#testing-overview)
|
| 12 |
+
2. [Functional Testing](#functional-testing)
|
| 13 |
+
- [Unit Testing](#unit-testing)
|
| 14 |
+
- [Integration Testing](#integration-testing)
|
| 15 |
+
- [System Testing](#system-testing)
|
| 16 |
+
3. [Non-Functional Testing](#non-functional-testing)
|
| 17 |
+
- [Performance Testing](#performance-testing)
|
| 18 |
+
- [Security Testing](#security-testing)
|
| 19 |
+
- [Reliability Testing](#reliability-testing)
|
| 20 |
+
- [Scalability Testing](#scalability-testing)
|
| 21 |
+
- [Usability Testing](#usability-testing)
|
| 22 |
+
4. [Test Environment Setup](#test-environment-setup)
|
| 23 |
+
5. [Testing Tools & Frameworks](#testing-tools--frameworks)
|
| 24 |
+
6. [Test Execution Plan](#test-execution-plan)
|
| 25 |
+
7. [Success Criteria & Metrics](#success-criteria--metrics)
|
| 26 |
+
8. [Risk Assessment](#risk-assessment)
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## Testing Overview
|
| 31 |
+
|
| 32 |
+
### Purpose
|
| 33 |
+
This document outlines the comprehensive testing strategy for DocGenie API, ensuring quality, reliability, and performance of the synthetic document generation system across all 19 pipeline stages.
|
| 34 |
+
|
| 35 |
+
### Scope
|
| 36 |
+
- API endpoints testing (`/generate`, `/generate/pdf`, `/generate/async`)
|
| 37 |
+
- 19-stage pipeline validation
|
| 38 |
+
- External service integrations (Claude API, RunPod handwriting service)
|
| 39 |
+
- Database operations (Supabase)
|
| 40 |
+
- Background job processing (Redis Queue)
|
| 41 |
+
- Error handling and recovery mechanisms
|
| 42 |
+
|
| 43 |
+
### Testing Approach
|
| 44 |
+
- **Test-Driven Development (TDD)**: Write tests before implementation where applicable
|
| 45 |
+
- **Continuous Integration**: Automated test execution on every commit
|
| 46 |
+
- **Coverage Target**: Minimum 80% code coverage for critical paths
|
| 47 |
+
- **Risk-Based Testing**: Prioritize high-risk components (LLM integration, handwriting service)
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Functional Testing
|
| 52 |
+
|
| 53 |
+
### A.1 Unit Testing
|
| 54 |
+
|
| 55 |
+
Unit tests verify individual functions and methods in isolation. Target: 85% code coverage.
|
| 56 |
+
|
| 57 |
+
#### **A.1.1 Seed Image Processing (Stage 01)**
|
| 58 |
+
|
| 59 |
+
**Module**: `api/utils.py::download_seed_images()`
|
| 60 |
+
|
| 61 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 62 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 63 |
+
| UT-SEED-001 | Download valid image URL | Valid HTTPS URL (JPEG) | Base64-encoded image string | High |
|
| 64 |
+
| UT-SEED-002 | Download PNG format | Valid PNG URL | Base64-encoded PNG | High |
|
| 65 |
+
| UT-SEED-003 | Handle 503 timeout error | URL returning 503 | Retry 3 times, eventual success | Critical |
|
| 66 |
+
| UT-SEED-004 | Handle 502 bad gateway | URL returning 502 | Retry with exponential backoff | High |
|
| 67 |
+
| UT-SEED-005 | Handle 404 not found | Invalid URL | Raise HTTPException(400) | High |
|
| 68 |
+
| UT-SEED-006 | Handle connection timeout | Slow/unresponsive server | Retry then raise exception | Medium |
|
| 69 |
+
| UT-SEED-007 | Validate image format | Non-image URL (HTML) | Raise validation error | Medium |
|
| 70 |
+
| UT-SEED-008 | Handle oversized images | >10MB image | Process or reject gracefully | Low |
|
| 71 |
+
| UT-SEED-009 | Test retry backoff timing | Mock 503 responses | Delays: 2s, 4s, 8s | Medium |
|
| 72 |
+
| UT-SEED-010 | Test max retries exhausted | Persistent 503 errors | Raise exception after 3 attempts | High |
|
| 73 |
+
|
| 74 |
+
**Test Implementation**:
|
| 75 |
+
```python
|
| 76 |
+
# test_seed_download.py
|
| 77 |
+
import pytest
import httpx
|
| 78 |
+
from api.utils import download_seed_images
|
| 79 |
+
from unittest.mock import patch, Mock
|
| 80 |
+
|
| 81 |
+
@pytest.mark.asyncio
|
| 82 |
+
async def test_download_valid_image():
|
| 83 |
+
url = "https://example.com/test.jpg"
|
| 84 |
+
with patch('httpx.AsyncClient') as mock_client:
|
| 85 |
+
mock_response = Mock()
|
| 86 |
+
mock_response.content = b'\xff\xd8\xff\xe0' # JPEG header
|
| 87 |
+
mock_client.return_value.__aenter__.return_value.get.return_value = mock_response
|
| 88 |
+
|
| 89 |
+
result = await download_seed_images([url])
|
| 90 |
+
assert len(result) == 1
|
| 91 |
+
assert isinstance(result[0], str) # base64 string
|
| 92 |
+
|
| 93 |
+
@pytest.mark.asyncio
|
| 94 |
+
async def test_download_503_retry():
|
| 95 |
+
url = "https://example.com/test.jpg"
|
| 96 |
+
with patch('httpx.AsyncClient') as mock_client:
|
| 97 |
+
# First two calls: 503, third call: success
|
| 98 |
+
responses = [
|
| 99 |
+
Mock(status_code=503, raise_for_status=Mock(side_effect=httpx.HTTPStatusError("503", request=Mock(), response=Mock()))),
|
| 100 |
+
Mock(status_code=503, raise_for_status=Mock(side_effect=httpx.HTTPStatusError("503", request=Mock(), response=Mock()))),
|
| 101 |
+
Mock(content=b'\xff\xd8\xff\xe0', raise_for_status=Mock())
|
| 102 |
+
]
|
| 103 |
+
mock_client.return_value.__aenter__.return_value.get.side_effect = responses
|
| 104 |
+
|
| 105 |
+
result = await download_seed_images([url])
|
| 106 |
+
assert len(result) == 1
|
| 107 |
+
assert mock_client.return_value.__aenter__.return_value.get.call_count == 3
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
#### **A.1.2 HTML Processing (Stage 03)**
|
| 111 |
+
|
| 112 |
+
**Module**: `api/utils.py::extract_html_documents_from_response()`
|
| 113 |
+
|
| 114 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 115 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 116 |
+
| UT-HTML-001 | Extract single HTML | LLM response with 1 HTML | List with 1 HTML document | High |
|
| 117 |
+
| UT-HTML-002 | Extract multiple HTMLs | Response with 3 HTMLs | List with 3 documents | High |
|
| 118 |
+
| UT-HTML-003 | Extract ground truth | HTML with `<script id="GT">` | GT JSON extracted, script removed | Critical |
|
| 119 |
+
| UT-HTML-004 | Handle malformed HTML | Invalid HTML tags | Parse with BeautifulSoup recovery | Medium |
|
| 120 |
+
| UT-HTML-005 | Handle missing DOCTYPE | HTML without DOCTYPE | Add DOCTYPE or flag error | Low |
|
| 121 |
+
| UT-HTML-006 | Validate CSS presence | HTML without `<style>` | Raise validation error | High |
|
| 122 |
+
| UT-HTML-007 | Extract handwriting markers | HTML with `class="handwritten"` | Identify 5 handwriting elements | High |
|
| 123 |
+
| UT-HTML-008 | Extract visual elements | HTML with `data-placeholder` | Identify 3 visual elements | High |
|
| 124 |
+
| UT-HTML-009 | Handle empty response | Empty string from LLM | Return empty list | Medium |
|
| 125 |
+
| UT-HTML-010 | Prettify minified HTML | Single-line HTML | Multi-line formatted HTML | Low |
|
| 126 |
+
|
| 127 |
+
#### **A.1.3 PDF Rendering (Stage 04)**
|
| 128 |
+
|
| 129 |
+
**Module**: `api/utils.py::render_html_to_pdf()`
|
| 130 |
+
|
| 131 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 132 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 133 |
+
| UT-PDF-001 | Render A4 document | HTML with A4 page size | PDF 210×297mm | High |
|
| 134 |
+
| UT-PDF-002 | Render Letter size | HTML with Letter page | PDF 215.9×279.4mm | Medium |
|
| 135 |
+
| UT-PDF-003 | Extract geometries | HTML with handwriting | Geometries JSON with rects | Critical |
|
| 136 |
+
| UT-PDF-004 | Handle custom fonts | HTML with @font-face | PDF with embedded fonts | Low |
|
| 137 |
+
| UT-PDF-005 | Preserve CSS styling | HTML with colors/borders | PDF matches visual style | Medium |
|
| 138 |
+
| UT-PDF-006 | Handle images in HTML | HTML with <img> tags | Images embedded in PDF | Low |
|
| 139 |
+
| UT-PDF-007 | Extract text coordinates | HTML with paragraphs | Accurate bbox coordinates | High |
|
| 140 |
+
| UT-PDF-008 | Handle landscape orientation | HTML with landscape CSS | PDF in landscape mode | Low |
|
| 141 |
+
| UT-PDF-009 | Validate page dimensions | Various page sizes | Dimensions match CSS @page | High |
|
| 142 |
+
| UT-PDF-010 | Handle Playwright errors | Browser crash scenario | Retry or graceful failure | Medium |
|
| 143 |
+
|
| 144 |
+
#### **A.1.4 Bbox Extraction (Stage 05)**
|
| 145 |
+
|
| 146 |
+
**Module**: `api/utils.py::extract_bboxes_from_rendered_pdf()`
|
| 147 |
+
|
| 148 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 149 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 150 |
+
| UT-BBOX-001 | Extract word bboxes | Standard PDF | List of word-level bboxes | Critical |
|
| 151 |
+
| UT-BBOX-002 | Extract char bboxes | Same PDF | List of char-level bboxes | High |
|
| 152 |
+
| UT-BBOX-003 | Handle multi-line text | PDF with paragraphs | Correct block/line grouping | High |
|
| 153 |
+
| UT-BBOX-004 | Filter whitespace | PDF with spaces/tabs | No whitespace-only bboxes | Medium |
|
| 154 |
+
| UT-BBOX-005 | Handle special characters | PDF with ©, ®, ™ | Characters properly extracted | Medium |
|
| 155 |
+
| UT-BBOX-006 | Handle non-Latin scripts | PDF with Chinese/Arabic | Correct unicode extraction | Low |
|
| 156 |
+
| UT-BBOX-007 | Validate coordinates | Extracted bboxes | All coords within page bounds | High |
|
| 157 |
+
| UT-BBOX-008 | Handle empty PDF | PDF with no text | Return empty list | Low |
|
| 158 |
+
| UT-BBOX-009 | Handle rotated text | PDF with rotation | Bboxes account for rotation | Low |
|
| 159 |
+
| UT-BBOX-010 | Parse bbox strings | "0_0_0 Hello 10 20 50 30" | OCRBox object with correct fields | High |
|
| 160 |
+
|
| 161 |
+
#### **A.1.5 Handwriting Region Extraction (Stage 07)**
|
| 162 |
+
|
| 163 |
+
**Module**: `api/utils.py::process_stage3_complete()` - handwriting section
|
| 164 |
+
|
| 165 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 166 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 167 |
+
| UT-HW-001 | Filter by handwriting_ratio | 10 regions, ratio=0.3 | ~3 regions selected | Critical |
|
| 168 |
+
| UT-HW-002 | Parse author IDs | `class="handwritten author1"` | author_id="author1" | High |
|
| 169 |
+
| UT-HW-003 | Match to word bboxes | Geometry + bboxes | Correct bbox mapping | Critical |
|
| 170 |
+
| UT-HW-004 | Handle signature class | `class="handwritten signature"` | is_signature=True | Medium |
|
| 171 |
+
| UT-HW-005 | DPI coordinate conversion | Browser coords (96 DPI) | PDF coords (72 DPI) with 0.75 scale | High |
|
| 172 |
+
| UT-HW-006 | Handle overlapping regions | 2 regions, same text | Prevent duplicate bbox usage | Medium |
|
| 173 |
+
| UT-HW-007 | Validate rect boundaries | Geometries with rect | Check bboxes within rect threshold | High |
|
| 174 |
+
| UT-HW-008 | Test seed reproducibility | Same seed, same input | Identical region selection | High |
|
| 175 |
+
| UT-HW-009 | Handle zero ratio | ratio=0.0 | No regions selected | Medium |
|
| 176 |
+
| UT-HW-010 | Handle full ratio | ratio=1.0 | All regions selected | Medium |
|
| 177 |
+
|
| 178 |
+
#### **A.1.6 Handwriting Service Integration**
|
| 179 |
+
|
| 180 |
+
**Module**: `api/utils.py::call_handwriting_service_batch()`
|
| 181 |
+
|
| 182 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 183 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 184 |
+
| UT-HWSVC-001 | Batch request format | 10 texts with metadata | Correct RunPod JSON format | Critical |
|
| 185 |
+
| UT-HWSVC-002 | Handle sync response | Immediate completion | Parse output.images[] | High |
|
| 186 |
+
| UT-HWSVC-003 | Handle IN_PROGRESS | Delayed completion | Poll status endpoint | Critical |
|
| 187 |
+
| UT-HWSVC-004 | Status polling timeout | Job exceeds 30 polls | Raise timeout exception | High |
|
| 188 |
+
| UT-HWSVC-005 | Handle FAILED status | RunPod job failure | Raise exception with error | High |
|
| 189 |
+
| UT-HWSVC-006 | Parse image results | Batch response | Map hw_id to image_base64 | Critical |
|
| 190 |
+
| UT-HWSVC-007 | Calculate dynamic timeout | 50 texts | Timeout = 50×20+30 = 1030s | Medium |
|
| 191 |
+
| UT-HWSVC-008 | Handle network errors | Connection timeout | Retry up to max_retries | High |
|
| 192 |
+
| UT-HWSVC-009 | Validate authorization | Missing API key | Request includes Bearer token | Medium |
|
| 193 |
+
| UT-HWSVC-010 | Test incremental backoff | Status polling | Delays: 5s, 6s, 7s... up to 10s | Low |
|
| 194 |
+
|
| 195 |
+
#### **A.1.7 Visual Element Generation (Stage 10)**
|
| 196 |
+
|
| 197 |
+
**Module**: `api/utils.py::generate_visual_element_images()`
|
| 198 |
+
|
| 199 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 200 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 201 |
+
| UT-VE-001 | Select logo prefab | type="logo" | Random logo from prefabs/ | High |
|
| 202 |
+
| UT-VE-002 | Select photo prefab | type="photo" | Random photo image | High |
|
| 203 |
+
| UT-VE-003 | Generate barcode | type="barcode" | EAN-13 barcode image | Medium |
|
| 204 |
+
| UT-VE-004 | Generate QR code | type="qr_code", content="URL" | QR code image | Medium |
|
| 205 |
+
| UT-VE-005 | Test seed reproducibility | Same seed, same type | Identical prefab selection | High |
|
| 206 |
+
| UT-VE-006 | Handle missing prefabs | type with no files | Fallback or error | Medium |
|
| 207 |
+
| UT-VE-007 | Load SVG prefabs | SVG logo file | Convert to PNG | Low |
|
| 208 |
+
| UT-VE-008 | Filter by requested types | types=["logo","signature"] | Only matching types generated | High |
|
| 209 |
+
| UT-VE-009 | Normalize type synonyms | "chart" → "figure" | Consistent type mapping | Medium |
|
| 210 |
+
| UT-VE-010 | Return base64 encoding | All image types | Valid base64 strings | High |
|
| 211 |
+
|
| 212 |
+
#### **A.1.8 PDF Modification (Stages 12-13)**
|
| 213 |
+
|
| 214 |
+
**Module**: `api/utils.py::process_stage3_complete()` - insertion sections
|
| 215 |
+
|
| 216 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 217 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 218 |
+
| UT-PDFMOD-001 | Whiteout text regions | 5 word bboxes | White rectangles drawn | High |
|
| 219 |
+
| UT-PDFMOD-002 | Insert handwriting image | Image + bbox | Image at correct position | Critical |
|
| 220 |
+
| UT-PDFMOD-003 | Apply random offsets | Word bbox | Position offset within limits | Medium |
|
| 221 |
+
| UT-PDFMOD-004 | Resize with aspect ratio | Wide/tall images | Scaled to fit bbox | High |
|
| 222 |
+
| UT-PDFMOD-005 | Insert visual element | Logo + rect | Centered in bbox | High |
|
| 223 |
+
| UT-PDFMOD-006 | Handle rotation | Element with rotation=45 | Rotated image insertion | Low |
|
| 224 |
+
| UT-PDFMOD-007 | Save intermediate PDF | After handwriting | _with_handwriting.pdf created | Medium |
|
| 225 |
+
| UT-PDFMOD-008 | Save final PDF | After visual elements | _final.pdf created | High |
|
| 226 |
+
| UT-PDFMOD-009 | Scale factor application | 3x upscale | High-res image quality | Medium |
|
| 227 |
+
| UT-PDFMOD-010 | Handle insertion errors | Invalid image data | Log error, continue | Medium |
|
| 228 |
+
|
| 229 |
+
#### **A.1.9 OCR Processing (Stage 15)**
|
| 230 |
+
|
| 231 |
+
**Module**: `api/utils.py::run_paddle_ocr()`
|
| 232 |
+
|
| 233 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 234 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 235 |
+
| UT-OCR-001 | OCR English text | English document image | Accurate word recognition | Critical |
|
| 236 |
+
| UT-OCR-002 | OCR with handwriting | Mixed typed/handwritten | Both text types detected | High |
|
| 237 |
+
| UT-OCR-003 | Extract word bboxes | Document image | List of word-level bboxes | Critical |
|
| 238 |
+
| UT-OCR-004 | Calculate confidence | OCR results | Confidence score per word | High |
|
| 239 |
+
| UT-OCR-005 | Handle low quality | Blurry/noisy image | Reasonable accuracy (>70%) | Medium |
|
| 240 |
+
| UT-OCR-006 | Handle rotated text | 90° rotated document | Correct orientation detection | Low |
|
| 241 |
+
| UT-OCR-007 | Multi-language support | Document with German text | lang="de" parameter works | Medium |
|
| 242 |
+
| UT-OCR-008 | Handle empty image | Blank white image | Empty results list | Low |
|
| 243 |
+
| UT-OCR-009 | DPI configuration | Various DPI settings | Consistent accuracy | Medium |
|
| 244 |
+
| UT-OCR-010 | Return image dimensions | Any image | width, height in pixels | High |
|
| 245 |
+
|
| 246 |
+
#### **A.1.10 Bbox Normalization (Stage 16)**
|
| 247 |
+
|
| 248 |
+
**Module**: `api/utils.py::normalize_bboxes()`
|
| 249 |
+
|
| 250 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 251 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 252 |
+
| UT-NORM-001 | Normalize to [0,1] | Pixel bboxes, image dims | Normalized coordinates | Critical |
|
| 253 |
+
| UT-NORM-002 | Handle out-of-bounds | x1 > image_width | Clipped to [0, 1] | High |
|
| 254 |
+
| UT-NORM-003 | Preserve text data | Bboxes with text field | Text preserved in output | High |
|
| 255 |
+
| UT-NORM-004 | Create segment bboxes | Word-level bboxes | Aggregated segment bboxes | Medium |
|
| 256 |
+
| UT-NORM-005 | Handle zero dimensions | Image with width=0 | Raise validation error | Low |
|
| 257 |
+
| UT-NORM-006 | Round to precision | Float coordinates | 6 decimal places | Low |
|
| 258 |
+
| UT-NORM-007 | Maintain bbox order | Ordered input list | Same order in output | Medium |
|
| 259 |
+
| UT-NORM-008 | Handle negative coords | bbox with x0=-5 | Clipped to 0 | Medium |
|
| 260 |
+
| UT-NORM-009 | Validate bbox format | Various input formats | Consistent output schema | High |
|
| 261 |
+
| UT-NORM-010 | Handle empty list | No bboxes | Return empty list | Low |
|
| 262 |
+
|
| 263 |
+
#### **A.1.11 Dataset Export (Stage 19)**
|
| 264 |
+
|
| 265 |
+
**Module**: `api/utils.py::export_to_msgpack()`
|
| 266 |
+
|
| 267 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 268 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 269 |
+
| UT-EXPORT-001 | Create msgpack file | Complete document data | Valid .msgpack file | Critical |
|
| 270 |
+
| UT-EXPORT-002 | Encode image bytes | PNG image | Binary image in msgpack | High |
|
| 271 |
+
| UT-EXPORT-003 | Store normalized bboxes | Normalized coordinates | Bboxes in [0,1] range | High |
|
| 272 |
+
| UT-EXPORT-004 | Store ground truth | GT JSON | GT dict in msgpack | High |
|
| 273 |
+
| UT-EXPORT-005 | Store metadata | Document metadata | Metadata dict in msgpack | Medium |
|
| 274 |
+
| UT-EXPORT-006 | Validate msgpack format | Generated file | Readable by msgpack.load() | Critical |
|
| 275 |
+
| UT-EXPORT-007 | Handle large files | 10MB+ image | Compression applied | Low |
|
| 276 |
+
| UT-EXPORT-008 | Store words list | OCR words | Ordered word list | High |
|
| 277 |
+
| UT-EXPORT-009 | Handle missing fields | Partial data | Fill with null/defaults | Medium |
|
| 278 |
+
| UT-EXPORT-010 | Return file path | Export operation | Absolute path to .msgpack | Medium |
|
| 279 |
+
|
| 280 |
+
#### **A.1.12 Validation Functions**
|
| 281 |
+
|
| 282 |
+
**Module**: `api/utils.py::validate_*()`
|
| 283 |
+
|
| 284 |
+
| Test Case ID | Test Name | Input | Expected Output | Priority |
|
| 285 |
+
|--------------|-----------|-------|-----------------|----------|
|
| 286 |
+
| UT-VAL-001 | Validate HTML structure | Valid HTML5 | (True, None) | High |
|
| 287 |
+
| UT-VAL-002 | Detect missing DOCTYPE | HTML without DOCTYPE | (False, "Missing DOCTYPE") | Medium |
|
| 288 |
+
| UT-VAL-003 | Detect missing CSS | HTML without <style> | (False, "Missing CSS") | High |
|
| 289 |
+
| UT-VAL-004 | Validate PDF file | Valid PDF | (True, None) | High |
|
| 290 |
+
| UT-VAL-005 | Detect corrupt PDF | Truncated PDF file | (False, "Corrupt PDF") | High |
|
| 291 |
+
| UT-VAL-006 | Validate bbox count | 100 bboxes, min=50 | (True, None) | Medium |
|
| 292 |
+
| UT-VAL-007 | Detect insufficient bboxes | 10 bboxes, min=50 | (False, "Insufficient bboxes") | Medium |
|
| 293 |
+
| UT-VAL-008 | Validate bbox coordinates | Valid bboxes | (True, None) | High |
|
| 294 |
+
| UT-VAL-009 | Detect invalid coordinates | x0 > x1 | (False, "Invalid bbox") | High |
|
| 295 |
+
| UT-VAL-010 | Validate page count | Multi-page PDF | (False, "Expected 1 page") | Medium |
|
| 296 |
+
|
| 297 |
+
**Total Unit Tests**: 120+ test cases
|
| 298 |
+
|
| 299 |
+
---
|
| 300 |
+
|
| 301 |
+
### A.2 Integration Testing
|
| 302 |
+
|
| 303 |
+
Integration tests verify interactions between multiple components. Target: Complete workflow coverage.
|
| 304 |
+
|
| 305 |
+
#### **A.2.1 Pipeline Stage Integration**
|
| 306 |
+
|
| 307 |
+
**Purpose**: Verify data flow between consecutive pipeline stages
|
| 308 |
+
|
| 309 |
+
| Test Case ID | Test Name | Components | Test Scenario | Priority |
|
| 310 |
+
|--------------|-----------|------------|---------------|----------|
|
| 311 |
+
| IT-PIPE-001 | Stages 01-03 integration | Seed download → LLM → HTML extraction | Download seeds, call LLM, extract HTML successfully | Critical |
|
| 312 |
+
| IT-PIPE-002 | Stages 03-05 integration | HTML extraction → PDF render → Bbox extraction | Clean HTML renders to PDF, bboxes extracted | Critical |
|
| 313 |
+
| IT-PIPE-003 | Stages 07-09 integration | HW extraction → Service call | HW regions trigger service batch request | Critical |
|
| 314 |
+
| IT-PIPE-004 | Stages 09-12 integration | HW generation → Insertion | Generated images inserted at correct positions | Critical |
|
| 315 |
+
| IT-PIPE-005 | Stages 14-15 integration | Image render → OCR | Final image passed to OCR successfully | High |
|
| 316 |
+
| IT-PIPE-006 | Stages 15-16 integration | OCR → Normalization | OCR bboxes normalized with correct dimensions | High |
|
| 317 |
+
| IT-PIPE-007 | Stages 07-13 complete | Full Stage 3 | Handwriting + visual elements end-to-end | Critical |
|
| 318 |
+
| IT-PIPE-008 | Stages 14-19 complete | Full Stages 4-5 | OCR → export complete workflow | High |
|
| 319 |
+
| IT-PIPE-009 | Stages 01-19 minimal | End-to-end minimal | No handwriting/VE, basic generation | Critical |
|
| 320 |
+
| IT-PIPE-010 | Stages 01-19 full | End-to-end full features | All features enabled, complete dataset | Critical |
|
| 321 |
+
|
| 322 |
+
#### **A.2.2 External Service Integration**
|
| 323 |
+
|
| 324 |
+
**Purpose**: Verify interactions with external APIs and services
|
| 325 |
+
|
| 326 |
+
| Test Case ID | Test Name | Services | Test Scenario | Priority |
|
| 327 |
+
|--------------|-----------|----------|---------------|----------|
|
| 328 |
+
| IT-EXT-001 | Claude API integration | Claude Messages API | Send prompt, receive valid response | Critical |
|
| 329 |
+
| IT-EXT-002 | Claude error handling | Claude API | Handle rate limits (429) gracefully | High |
|
| 330 |
+
| IT-EXT-003 | Claude retry logic | Claude API | Automatic retry on transient errors | High |
|
| 331 |
+
| IT-EXT-004 | RunPod sync integration | RunPod /runsync | Send batch, receive images | Critical |
|
| 332 |
+
| IT-EXT-005 | RunPod async integration | RunPod /run + status | Queue job, poll until completion | High |
|
| 333 |
+
| IT-EXT-006 | RunPod auth | RunPod API | Bearer token authentication works | Medium |
|
| 334 |
+
| IT-EXT-007 | Supabase storage | Supabase storage API | Upload/download seed images | Medium |
|
| 335 |
+
| IT-EXT-008 | Supabase database | Supabase DB | Store generation metadata | Medium |
|
| 336 |
+
| IT-EXT-009 | Redis Queue | RQ worker | Enqueue async job, process in background | High |
|
| 337 |
+
| IT-EXT-010 | Google Drive | Drive API (optional) | Export to Google Drive if configured | Low |
|
| 338 |
+
|
| 339 |
+
#### **A.2.3 Database Operations**
|
| 340 |
+
|
| 341 |
+
**Purpose**: Verify database interactions (Supabase)
|
| 342 |
+
|
| 343 |
+
| Test Case ID | Test Name | Operations | Test Scenario | Priority |
|
| 344 |
+
|--------------|-----------|------------|---------------|----------|
|
| 345 |
+
| IT-DB-001 | Insert generation record | INSERT | New generation logged in DB | High |
|
| 346 |
+
| IT-DB-002 | Update generation status | UPDATE | Status changes reflected | High |
|
| 347 |
+
| IT-DB-003 | Query by task ID | SELECT | Retrieve generation by ID | High |
|
| 348 |
+
| IT-DB-004 | Store metadata | INSERT | Complete metadata stored | Medium |
|
| 349 |
+
| IT-DB-005 | Handle connection errors | Network failure | Retry or graceful degradation | High |
|
| 350 |
+
| IT-DB-006 | Transaction rollback | Error mid-transaction | Data consistency maintained | Medium |
|
| 351 |
+
| IT-DB-007 | Concurrent updates | Multiple workers | No race conditions | Medium |
|
| 352 |
+
| IT-DB-008 | Pagination | Large result sets | Efficient pagination | Low |
|
| 353 |
+
| IT-DB-009 | Search functionality | Full-text search | Search by doc_type, language | Low |
|
| 354 |
+
| IT-DB-010 | Data retention | Cleanup old data | Archive/delete after N days | Low |
|
| 355 |
+
|
| 356 |
+
#### **A.2.4 API Endpoint Integration**
|
| 357 |
+
|
| 358 |
+
**Purpose**: Test complete request/response cycles through endpoints
|
| 359 |
+
|
| 360 |
+
| Test Case ID | Test Name | Endpoint | Test Scenario | Priority |
|
| 361 |
+
|--------------|-----------|----------|---------------|----------|
|
| 362 |
+
| IT-API-001 | GET /health | Health check | Returns 200 with system status | Critical |
|
| 363 |
+
| IT-API-002 | POST /generate | Legacy endpoint | Returns JSON with complete data | High |
|
| 364 |
+
| IT-API-003 | POST /generate/pdf | Sync PDF endpoint | Returns ZIP file download | Critical |
|
| 365 |
+
| IT-API-004 | POST /generate/async | Async endpoint | Returns task ID | Critical |
|
| 366 |
+
| IT-API-005 | GET /generate/async/status/{id} | Status check | Returns current job status | Critical |
|
| 367 |
+
| IT-API-006 | GET /generate/async/result/{id} | Result download | Returns ZIP when complete | High |
|
| 368 |
+
| IT-API-007 | Request validation | All endpoints | Invalid params rejected with 400 | High |
|
| 369 |
+
| IT-API-008 | Authentication | Protected endpoints | Requires valid API key | High |
|
| 370 |
+
| IT-API-009 | Rate limiting | All endpoints | Enforces rate limits | Medium |
|
| 371 |
+
| IT-API-010 | CORS headers | All endpoints | Correct CORS configuration | Medium |
|
| 372 |
+
|
| 373 |
+
#### **A.2.5 Background Worker Integration**
|
| 374 |
+
|
| 375 |
+
**Purpose**: Test async job processing via Redis Queue
|
| 376 |
+
|
| 377 |
+
| Test Case ID | Test Name | Components | Test Scenario | Priority |
|
| 378 |
+
|--------------|-----------|------------|---------------|----------|
|
| 379 |
+
| IT-WORKER-001 | Job enqueue | API → RQ | Job added to queue successfully | Critical |
|
| 380 |
+
| IT-WORKER-002 | Job processing | Worker → Pipeline | Worker picks up and processes job | Critical |
|
| 381 |
+
| IT-WORKER-003 | Job status updates | Worker → DB | Status updated throughout processing | High |
|
| 382 |
+
| IT-WORKER-004 | Job failure handling | Worker error | Failed job logged, error reported | High |
|
| 383 |
+
| IT-WORKER-005 | Job retry | Transient failure | Failed job retried up to max attempts | High |
|
| 384 |
+
| IT-WORKER-006 | Job timeout | Long-running job | Timeout enforced, job killed | Medium |
|
| 385 |
+
| IT-WORKER-007 | Result storage | Worker → Storage | Results saved to correct location | High |
|
| 386 |
+
| IT-WORKER-008 | Queue priority | Multiple jobs | High priority jobs processed first | Low |
|
| 387 |
+
| IT-WORKER-009 | Worker scaling | Multiple workers | Jobs distributed across workers | Medium |
|
| 388 |
+
| IT-WORKER-010 | Worker health | Worker crash | Replaced automatically, jobs reassigned | High |
|
| 389 |
+
|
| 390 |
+
**Total Integration Tests**: 50+ test cases
|
| 391 |
+
|
| 392 |
+
---
|
| 393 |
+
|
| 394 |
+
### A.3 System Testing
|
| 395 |
+
|
| 396 |
+
System tests verify end-to-end workflows from user perspective. Target: All user journeys covered.
|
| 397 |
+
|
| 398 |
+
#### **A.3.1 Complete Generation Workflows**
|
| 399 |
+
|
| 400 |
+
| Test Case ID | Test Name | Workflow | Test Scenario | Expected Outcome | Priority |
|
| 401 |
+
|--------------|-----------|----------|---------------|------------------|----------|
|
| 402 |
+
| ST-GEN-001 | Basic document generation | Minimal config | Generate 1 English invoice, no handwriting/VE | PDF + metadata returned in <60s | Critical |
|
| 403 |
+
| ST-GEN-002 | Handwriting generation | Enable handwriting | Generate document with handwriting | Handwriting visible in PDF | Critical |
|
| 404 |
+
| ST-GEN-003 | Visual elements | Enable VE | Generate document with logo + barcode | Elements visible in PDF | High |
|
| 405 |
+
| ST-GEN-004 | Full feature set | All features enabled | Generate with HW + VE + OCR + analysis | Complete dataset ZIP | Critical |
|
| 406 |
+
| ST-GEN-005 | Multi-document batch | num_solutions=5 | Generate 5 documents from 3 seeds | 5 complete documents | High |
|
| 407 |
+
| ST-GEN-006 | Reproducible generation | Same seed value | Generate twice with seed=42 | Identical outputs | High |
|
| 408 |
+
| ST-GEN-007 | Multi-language | language="german" | Generate German document | Correct language output | Medium |
|
| 409 |
+
| ST-GEN-008 | Various doc types | doc_type variations | Test invoice, receipt, form, letter | All types work | High |
|
| 410 |
+
| ST-GEN-009 | Different GT formats | gt_type="kie" / "qa" | Test both GT formats | Correct GT structure | High |
|
| 411 |
+
| ST-GEN-010 | Custom seed images | User-provided URLs | Generate from user's images | Images influence output | High |
|
| 412 |
+
|
| 413 |
+
#### **A.3.2 Error Handling Workflows**
|
| 414 |
+
|
| 415 |
+
| Test Case ID | Test Name | Error Condition | Test Scenario | Expected Outcome | Priority |
|
| 416 |
+
|--------------|-----------|-----------------|---------------|------------------|----------|
|
| 417 |
+
| ST-ERR-001 | Invalid seed URL | 404 not found | Submit invalid image URL | HTTP 400 with clear error message | High |
|
| 418 |
+
| ST-ERR-002 | LLM API failure | Claude API down | Submit request during outage | HTTP 503 with retry-after | Critical |
|
| 419 |
+
| ST-ERR-003 | Handwriting service failure | RunPod timeout | Enable handwriting, service fails | HTTP 500, generation stopped | Critical |
|
| 420 |
+
| ST-ERR-004 | Invalid parameters | Missing required field | Omit doc_type parameter | HTTP 422 with validation details | High |
|
| 421 |
+
| ST-ERR-005 | Rate limit exceeded | Too many requests | Submit 100 concurrent requests | HTTP 429 with retry info | High |
|
| 422 |
+
| ST-ERR-006 | Payload too large | Huge request | Submit 50 seed image URLs | HTTP 413 payload too large | Medium |
|
| 423 |
+
| ST-ERR-007 | Malformed JSON | Invalid JSON | Submit broken JSON request | HTTP 400 with parse error | High |
|
| 424 |
+
| ST-ERR-008 | Authentication failure | Missing/invalid API key | Request without auth | HTTP 401 unauthorized | High |
|
| 425 |
+
| ST-ERR-009 | Database connection loss | DB unavailable | Submit during DB outage | Graceful degradation or 503 | Medium |
|
| 426 |
+
| ST-ERR-010 | Disk space exhausted | No storage space | Generate large batch | HTTP 507 insufficient storage | Low |
|
| 427 |
+
|
| 428 |
+
#### **A.3.3 Async Processing Workflows**
|
| 429 |
+
|
| 430 |
+
| Test Case ID | Test Name | Workflow | Test Scenario | Expected Outcome | Priority |
|
| 431 |
+
|--------------|-----------|----------|---------------|------------------|----------|
|
| 432 |
+
| ST-ASYNC-001 | Submit async job | POST /generate/async | Submit batch job | Receive task ID immediately | Critical |
|
| 433 |
+
| ST-ASYNC-002 | Check pending status | GET status before completion | Poll status endpoint | Returns "pending" or "processing" | High |
|
| 434 |
+
| ST-ASYNC-003 | Check completed status | GET status after completion | Poll status after 5 minutes | Returns "completed" | Critical |
|
| 435 |
+
| ST-ASYNC-004 | Download results | GET result/{id} | Download after completion | Returns ZIP file | Critical |
|
| 436 |
+
| ST-ASYNC-005 | Check failed status | Job fails during processing | Check status of failed job | Returns "failed" with error details | High |
|
| 437 |
+
| ST-ASYNC-006 | Multiple concurrent jobs | Submit 10 jobs | 10 async submissions | All jobs process independently | High |
|
| 438 |
+
| ST-ASYNC-007 | Job cancellation | Cancel in-progress job | Submit, then cancel | Job stops, partial results cleaned | Medium |
|
| 439 |
+
| ST-ASYNC-008 | Result expiration | Check old results | Access 7-day old result | HTTP 410 gone (expired) | Low |
|
| 440 |
+
| ST-ASYNC-009 | Progress updates | Monitor long job | Poll during processing | Progress % increases | Medium |
|
| 441 |
+
| ST-ASYNC-010 | Worker restart recovery | Worker crashes mid-job | Kill worker process | Job reassigned, completes | High |
|
| 442 |
+
|
| 443 |
+
#### **A.3.4 Data Quality Workflows**
|
| 444 |
+
|
| 445 |
+
| Test Case ID | Test Name | Quality Check | Test Scenario | Expected Outcome | Priority |
|
| 446 |
+
|--------------|-----------|---------------|---------------|------------------|----------|
|
| 447 |
+
| ST-QUAL-001 | OCR accuracy | Compare OCR to ground truth | Generate doc, compare OCR text to GT | >90% accuracy | High |
|
| 448 |
+
| ST-QUAL-002 | Bbox alignment | Visual inspection | Generate doc with debug viz | Bboxes align with text | High |
|
| 449 |
+
| ST-QUAL-003 | Handwriting quality | Visual realism | Generate handwritten doc | Handwriting looks realistic | Medium |
|
| 450 |
+
| ST-QUAL-004 | Visual element placement | Correct positioning | Generate with logo + barcode | Elements at correct positions | High |
|
| 451 |
+
| ST-QUAL-005 | GT completeness | All GT fields present | Generate KIE document | All expected GT fields extracted | High |
|
| 452 |
+
| ST-QUAL-006 | Dataset format validity | msgpack validation | Export dataset | PyTorch can load msgpack | High |
|
| 453 |
+
| ST-QUAL-007 | Image resolution | Check output image | Render final image | Minimum 220 DPI quality | Medium |
|
| 454 |
+
| ST-QUAL-008 | PDF compliance | PDF/A validation | Generate PDF | Valid PDF/A format | Low |
|
| 455 |
+
| ST-QUAL-009 | Metadata accuracy | Check metadata fields | Generate document | Metadata matches actual data | High |
|
| 456 |
+
| ST-QUAL-010 | Reproducibility | Same input → same output | Generate 3 times with seed | All outputs identical | High |
|
| 457 |
+
|
| 458 |
+
#### **A.3.5 Performance Workflows**
|
| 459 |
+
|
| 460 |
+
| Test Case ID | Test Name | Performance Metric | Test Scenario | Target Performance | Priority |
|
| 461 |
+
|--------------|-----------|-------------------|---------------|---------------------|----------|
|
| 462 |
+
| ST-PERF-001 | Basic generation time | Time to completion | Generate minimal document | <60 seconds | High |
|
| 463 |
+
| ST-PERF-002 | Handwriting generation time | Time with HW | Generate with 20 HW words | <300 seconds | High |
|
| 464 |
+
| ST-PERF-003 | Batch generation time | Multiple documents | Generate 10 documents | <15 minutes | Medium |
|
| 465 |
+
| ST-PERF-004 | API response time | Endpoint latency | Submit request | <500ms to return task ID | High |
|
| 466 |
+
| ST-PERF-005 | Status check latency | Status endpoint | Check job status | <100ms response time | Medium |
|
| 467 |
+
| ST-PERF-006 | Concurrent requests | Load handling | 50 concurrent requests | All complete successfully | High |
|
| 468 |
+
| ST-PERF-007 | Large payload | Big request | 8 seed images, 10 solutions | Processes without timeout | Medium |
|
| 469 |
+
| ST-PERF-008 | Memory usage | Resource consumption | Generate 100 documents | <8GB RAM per worker | Medium |
|
| 470 |
+
| ST-PERF-009 | Disk I/O | Storage performance | Rapid sequential generations | No I/O bottleneck | Low |
|
| 471 |
+
| ST-PERF-010 | Network bandwidth | Data transfer | Download large result ZIP | Download completes in <60s | Low |
|
| 472 |
+
|
| 473 |
+
**Total System Tests**: 50+ test cases
|
| 474 |
+
|
| 475 |
+
---
|
| 476 |
+
|
| 477 |
+
## Non-Functional Testing
|
| 478 |
+
|
| 479 |
+
### B.1 Performance Testing
|
| 480 |
+
|
| 481 |
+
Purpose: Verify system performance under various load conditions.
|
| 482 |
+
|
| 483 |
+
#### **B.1.1 Load Testing**
|
| 484 |
+
|
| 485 |
+
**Tool**: Apache JMeter / Locust
|
| 486 |
+
|
| 487 |
+
| Test Case ID | Test Name | Load Profile | Metrics | Acceptance Criteria | Priority |
|
| 488 |
+
|--------------|-----------|--------------|---------|---------------------|----------|
|
| 489 |
+
| NFT-LOAD-001 | Normal load | 10 concurrent users, 1 hour | Throughput, response time | Avg response <5s, 0 errors | Critical |
|
| 490 |
+
| NFT-LOAD-002 | Peak load | 50 concurrent users, 30 min | Throughput, error rate | <5% error rate, response <15s | Critical |
|
| 491 |
+
| NFT-LOAD-003 | Sustained load | 25 concurrent users, 4 hours | CPU, memory, throughput | Stable resource usage, no leaks | High |
|
| 492 |
+
| NFT-LOAD-004 | Ramp-up load | 1→100 users over 30 min | System behavior | Graceful scaling or degradation | High |
|
| 493 |
+
| NFT-LOAD-005 | Spike load | Sudden 0→100 users | Response time spike | Recovers within 2 minutes | Medium |
|
| 494 |
+
|
| 495 |
+
**Test Script Example (Locust)**:
|
| 496 |
+
```python
|
| 497 |
+
# locustfile.py
|
| 498 |
+
from locust import HttpUser, task, between
|
| 499 |
+
|
| 500 |
+
class DocGenieUser(HttpUser):
|
| 501 |
+
wait_time = between(5, 15)
|
| 502 |
+
|
| 503 |
+
@task(3)
|
| 504 |
+
def generate_basic_document(self):
|
| 505 |
+
payload = {
|
| 506 |
+
"seed_images": ["https://example.com/seed1.jpg"],
|
| 507 |
+
"prompt_params": {
|
| 508 |
+
"language": "english",
|
| 509 |
+
"doc_type": "invoice",
|
| 510 |
+
"num_solutions": 1,
|
| 511 |
+
"enable_handwriting": False,
|
| 512 |
+
"enable_visual_elements": False
|
| 513 |
+
}
|
| 514 |
+
}
|
| 515 |
+
self.client.post("/generate", json=payload, timeout=120)
|
| 516 |
+
|
| 517 |
+
@task(1)
|
| 518 |
+
def check_async_status(self):
|
| 519 |
+
# NOTE: self.task_id must be captured from a prior POST /generate/async response (e.g. in on_start); it is never set in this snippet, so calling this task as-is raises AttributeError
|
| 520 |
+
self.client.get(f"/generate/async/status/{self.task_id}")
|
| 521 |
+
```
|
| 522 |
+
|
| 523 |
+
#### **B.1.2 Stress Testing**
|
| 524 |
+
|
| 525 |
+
**Purpose**: Determine the system's breaking point
|
| 526 |
+
|
| 527 |
+
| Test Case ID | Test Name | Stress Condition | Metrics | Acceptance Criteria | Priority |
|
| 528 |
+
|--------------|-----------|------------------|---------|---------------------|----------|
|
| 529 |
+
| NFT-STRESS-001 | User overload | 200+ concurrent users | Max capacity | Identifies max users before failure | High |
|
| 530 |
+
| NFT-STRESS-002 | Memory stress | Generate 1000 docs without cleanup | Memory usage | OOM protection, graceful failure | High |
|
| 531 |
+
| NFT-STRESS-003 | CPU stress | Complex documents, no throttling | CPU utilization | System remains responsive | Medium |
|
| 532 |
+
| NFT-STRESS-004 | Disk stress | Fill 95% of disk space | I/O performance | Handles low disk gracefully | Medium |
|
| 533 |
+
| NFT-STRESS-005 | Network stress | Simulate slow network | Timeout handling | Appropriate timeouts, retries | Medium |
|
| 534 |
+
|
| 535 |
+
#### **B.1.3 Endurance Testing (Soak Testing)**
|
| 536 |
+
|
| 537 |
+
**Purpose**: Detect memory leaks and performance degradation over time
|
| 538 |
+
|
| 539 |
+
| Test Case ID | Test Name | Duration | Load | Metrics | Acceptance Criteria | Priority |
|
| 540 |
+
|--------------|-----------|----------|------|---------|---------------------|----------|
|
| 541 |
+
| NFT-ENDUR-001 | 24-hour test | 24 hours | 10 concurrent users | Memory, CPU over time | No memory leaks, stable performance | High |
|
| 542 |
+
| NFT-ENDUR-002 | 7-day test | 7 days | 5 concurrent users | All resources | System stable, no degradation | Medium |
|
| 543 |
+
| NFT-ENDUR-003 | Weekend load | 48 hours | Variable load | Error rate | <1% errors throughout | Medium |
|
| 544 |
+
|
| 545 |
+
#### **B.1.4 Scalability Testing**
|
| 546 |
+
|
| 547 |
+
**Purpose**: Verify horizontal and vertical scaling
|
| 548 |
+
|
| 549 |
+
| Test Case ID | Test Name | Scaling Type | Test Scenario | Acceptance Criteria | Priority |
|
| 550 |
+
|--------------|-----------|--------------|---------------|---------------------|----------|
|
| 551 |
+
| NFT-SCALE-001 | Horizontal scaling | Add workers | 1→5 workers, measure throughput | Linear throughput increase | High |
|
| 552 |
+
| NFT-SCALE-002 | Vertical scaling | Increase CPU/RAM | 2→8 cores, 4→16GB RAM | Performance improvement | Medium |
|
| 553 |
+
| NFT-SCALE-003 | Auto-scaling | Dynamic load | Trigger auto-scale rules | Scales up/down automatically | Medium |
|
| 554 |
+
| NFT-SCALE-004 | Database scaling | Database load | High concurrent DB ops | No DB bottleneck | High |
|
| 555 |
+
| NFT-SCALE-005 | Storage scaling | Large datasets | Generate 10,000 documents | Storage handles volume | Low |
|
| 556 |
+
|
| 557 |
+
#### **B.1.5 Benchmark Testing**
|
| 558 |
+
|
| 559 |
+
**Purpose**: Establish performance baselines
|
| 560 |
+
|
| 561 |
+
| Test Case ID | Component | Benchmark | Target | Priority |
|
| 562 |
+
|--------------|-----------|-----------|--------|----------|
|
| 563 |
+
| NFT-BENCH-001 | Seed download | 1 image (1MB) | <2 seconds | High |
|
| 564 |
+
| NFT-BENCH-002 | LLM call | 1 prompt (standard) | <30 seconds | Critical |
|
| 565 |
+
| NFT-BENCH-003 | PDF rendering | 1 A4 page | <3 seconds | High |
|
| 566 |
+
| NFT-BENCH-004 | Bbox extraction | 500 words | <2 seconds | Medium |
|
| 567 |
+
| NFT-BENCH-005 | Handwriting service | 10 words batch | <200 seconds | Critical |
|
| 568 |
+
| NFT-BENCH-006 | Visual element generation | 5 elements | <5 seconds | Medium |
|
| 569 |
+
| NFT-BENCH-007 | OCR processing | 1 A4 page (300 DPI) | <5 seconds | High |
|
| 570 |
+
| NFT-BENCH-008 | Msgpack export | 1 document | <2 seconds | Medium |
|
| 571 |
+
| NFT-BENCH-009 | Complete pipeline (minimal) | End-to-end | <60 seconds | Critical |
|
| 572 |
+
| NFT-BENCH-010 | Complete pipeline (full) | End-to-end with HW | <300 seconds | Critical |
|
| 573 |
+
|
| 574 |
+
---
|
| 575 |
+
|
| 576 |
+
### B.2 Security Testing
|
| 577 |
+
|
| 578 |
+
Purpose: Identify vulnerabilities and ensure data protection.
|
| 579 |
+
|
| 580 |
+
#### **B.2.1 Authentication & Authorization Testing**
|
| 581 |
+
|
| 582 |
+
| Test Case ID | Test Name | Security Control | Test Scenario | Expected Outcome | Priority |
|
| 583 |
+
|--------------|-----------|------------------|---------------|------------------|----------|
|
| 584 |
+
| NFT-SEC-001 | API key validation | Authentication | Request without API key | HTTP 401 Unauthorized | Critical |
|
| 585 |
+
| NFT-SEC-002 | Invalid API key | Authentication | Request with wrong key | HTTP 401 Unauthorized | Critical |
|
| 586 |
+
| NFT-SEC-003 | Expired API key | Token expiration | Request with expired key | HTTP 401 with renewal info | High |
|
| 587 |
+
| NFT-SEC-004 | API key rotation | Key management | Rotate keys, test old key | Old key rejected | Medium |
|
| 588 |
+
| NFT-SEC-005 | Role-based access | Authorization | User tries admin endpoint | HTTP 403 Forbidden | High |
|
| 589 |
+
| NFT-SEC-006 | Resource ownership | Authorization | User accesses other's job | HTTP 403 Forbidden | High |
|
| 590 |
+
| NFT-SEC-007 | JWT validation | Token security | Tampered JWT token | Signature validation fails | High |
|
| 591 |
+
| NFT-SEC-008 | Session hijacking | Session security | Stolen session token | Token invalidated after detection | Medium |
|
| 592 |
+
| NFT-SEC-009 | Brute force protection | Rate limiting | 100 failed auth attempts | Account locked, IP blocked | High |
|
| 593 |
+
| NFT-SEC-010 | Multi-factor auth | MFA | Admin login without MFA | MFA required | Low |
|
| 594 |
+
|
| 595 |
+
#### **B.2.2 Input Validation & Injection Testing**
|
| 596 |
+
|
| 597 |
+
| Test Case ID | Test Name | Vulnerability | Test Scenario | Expected Outcome | Priority |
|
| 598 |
+
|--------------|-----------|---------------|---------------|------------------|----------|
|
| 599 |
+
| NFT-SEC-011 | SQL injection | Injection | Inject SQL in parameters | Parameterized queries prevent injection | Critical |
|
| 600 |
+
| NFT-SEC-012 | XSS attack | Cross-site scripting | Inject `<script>` in doc_type | Input sanitized, script not executed | High |
|
| 601 |
+
| NFT-SEC-013 | Command injection | OS command injection | Inject shell commands | Commands not executed | Critical |
|
| 602 |
+
| NFT-SEC-014 | Path traversal | Directory traversal | `../../etc/passwd` in filename | Access denied | Critical |
|
| 603 |
+
| NFT-SEC-015 | SSRF attack | Server-side request forgery | seed_image URL to internal IP | Internal IPs blocked | High |
|
| 604 |
+
| NFT-SEC-016 | XXE attack | XML external entity | Upload XML with external entity | External entities disabled | Medium |
|
| 605 |
+
| NFT-SEC-017 | LLM prompt injection | Prompt manipulation | Inject ignore instructions | Prompt sandboxing prevents escape | High |
|
| 606 |
+
| NFT-SEC-018 | Buffer overflow | Memory safety | Send 10MB+ parameter | Request rejected, no crash | Medium |
|
| 607 |
+
| NFT-SEC-019 | Unicode attack | Unicode bypass | Unicode normalization tricks | Normalized before processing | Low |
|
| 608 |
+
| NFT-SEC-020 | Regex DoS | ReDoS | Complex regex in input | Timeout protection active | Medium |
|
| 609 |
+
|
| 610 |
+
#### **B.2.3 Data Protection Testing**
|
| 611 |
+
|
| 612 |
+
| Test Case ID | Test Name | Protection Mechanism | Test Scenario | Expected Outcome | Priority |
|
| 613 |
+
|--------------|-----------|---------------------|---------------|------------------|----------|
|
| 614 |
+
| NFT-SEC-021 | Data encryption at rest | Storage encryption | Check stored files | Files encrypted on disk | High |
|
| 615 |
+
| NFT-SEC-022 | Data encryption in transit | TLS/HTTPS | Inspect network traffic | All traffic over HTTPS | Critical |
|
| 616 |
+
| NFT-SEC-023 | API key exposure | Secret management | Check logs, errors | API keys never logged | Critical |
|
| 617 |
+
| NFT-SEC-024 | PII handling | Data privacy | Generate docs with PII | PII not stored beyond retention | High |
|
| 618 |
+
| NFT-SEC-025 | Data sanitization | Data cleanup | Delete job after 7 days | All data removed | High |
|
| 619 |
+
| NFT-SEC-026 | Backup encryption | Backup security | Check backup files | Backups encrypted | Medium |
|
| 620 |
+
| NFT-SEC-027 | Secure headers | HTTP headers | Check response headers | Security headers present | High |
|
| 621 |
+
| NFT-SEC-028 | CORS policy | Cross-origin security | Request from unauthorized origin | CORS policy blocks request | High |
|
| 622 |
+
| NFT-SEC-029 | Cookie security | Cookie flags | Check cookie attributes | HttpOnly, Secure, SameSite set | Medium |
|
| 623 |
+
| NFT-SEC-030 | Sensitive data in URLs | URL security | Check for secrets in URLs | No sensitive data in query params | High |
|
| 624 |
+
|
| 625 |
+
#### **B.2.4 Dependency & Supply Chain Security**
|
| 626 |
+
|
| 627 |
+
| Test Case ID | Test Name | Security Aspect | Test Method | Expected Outcome | Priority |
|
| 628 |
+
|--------------|-----------|-----------------|-------------|------------------|----------|
|
| 629 |
+
| NFT-SEC-031 | Vulnerable dependencies | CVE scanning | Run `pip-audit` | No high/critical vulnerabilities | High |
|
| 630 |
+
| NFT-SEC-032 | Outdated packages | Package versions | Check `requirements.txt` | All packages recent (<6 months) | Medium |
|
| 631 |
+
| NFT-SEC-033 | Malicious packages | Supply chain | Verify package checksums | Checksums match official registry | High |
|
| 632 |
+
| NFT-SEC-034 | License compliance | Legal compliance | Check package licenses | All licenses compatible | Low |
|
| 633 |
+
| NFT-SEC-035 | Container security | Docker image | Scan with Trivy | No critical image vulnerabilities | High |
|
| 634 |
+
|
| 635 |
+
**Security Testing Tools**:
|
| 636 |
+
- **OWASP ZAP**: Automated security scanning
|
| 637 |
+
- **Burp Suite**: Manual penetration testing
|
| 638 |
+
- **pip-audit**: Python dependency vulnerability scanning
|
| 639 |
+
- **Trivy**: Container image scanning
|
| 640 |
+
- **Bandit**: Python code security linter
|
| 641 |
+
|
| 642 |
+
---
|
| 643 |
+
|
| 644 |
+
### B.3 Reliability Testing
|
| 645 |
+
|
| 646 |
+
Purpose: Verify system stability and fault tolerance.
|
| 647 |
+
|
| 648 |
+
#### **B.3.1 Fault Tolerance Testing**
|
| 649 |
+
|
| 650 |
+
| Test Case ID | Test Name | Fault Condition | Test Scenario | Expected Outcome | Priority |
|
| 651 |
+
|--------------|-----------|-----------------|---------------|------------------|----------|
|
| 652 |
+
| NFT-REL-001 | Database failover | Primary DB failure | Kill primary DB instance | Failover to standby, no downtime | Critical |
|
| 653 |
+
| NFT-REL-002 | Worker crash recovery | Worker process crash | Kill worker mid-job | Job reassigned, completes | High |
|
| 654 |
+
| NFT-REL-003 | Network partition | Network split | Simulate network partition | System detects, retries | High |
|
| 655 |
+
| NFT-REL-004 | External API failure | Claude API down | LLM service unavailable | Graceful error, retry queue | Critical |
|
| 656 |
+
| NFT-REL-005 | Handwriting service failure | RunPod timeout | Service exceeds timeout | Exception raised, clear error | Critical |
|
| 657 |
+
| NFT-REL-006 | Disk full | No storage space | Fill disk to 100% | Rejects new jobs, alerts sent | High |
|
| 658 |
+
| NFT-REL-007 | Redis failure | Queue unavailable | Redis server down | Async jobs fail with clear error | High |
|
| 659 |
+
| NFT-REL-008 | Load balancer failure | LB goes down | Kill load balancer | Requests reach servers via backup | Medium |
|
| 660 |
+
| NFT-REL-009 | DNS resolution failure | DNS timeout | DNS server unreachable | Falls back to IP or cached DNS | Low |
|
| 661 |
+
| NFT-REL-010 | Partial service degradation | Some features down | VE prefabs missing | Skips VE, completes other features | Medium |
|
| 662 |
+
|
| 663 |
+
#### **B.3.2 Data Integrity Testing**
|
| 664 |
+
|
| 665 |
+
| Test Case ID | Test Name | Integrity Check | Test Scenario | Expected Outcome | Priority |
|
| 666 |
+
|--------------|-----------|-----------------|---------------|------------------|----------|
|
| 667 |
+
| NFT-REL-011 | Transaction atomicity | Database transactions | Simulate crash mid-transaction | Either all or no changes applied | High |
|
| 668 |
+
| NFT-REL-012 | Data corruption detection | Checksum validation | Corrupt file on disk | Corruption detected, file rejected | High |
|
| 669 |
+
| NFT-REL-013 | Concurrent write safety | Race conditions | Multiple writes to same resource | Last write wins or lock prevents | High |
|
| 670 |
+
| NFT-REL-014 | Duplicate prevention | Idempotency | Submit same request twice | Duplicate detected, not processed | Medium |
|
| 671 |
+
| NFT-REL-015 | Backup restoration | Backup recovery | Restore from backup | Data fully restored, consistent | High |
|
| 672 |
+
|
| 673 |
+
#### **B.3.3 Recovery Testing**
|
| 674 |
+
|
| 675 |
+
| Test Case ID | Test Name | Recovery Scenario | Test Procedure | Expected Outcome | Priority |
|
| 676 |
+
|--------------|-----------|-------------------|----------------|------------------|----------|
|
| 677 |
+
| NFT-REL-016 | Crash recovery | Server crash | Kill server, restart | Server recovers, in-flight jobs resume | Critical |
|
| 678 |
+
| NFT-REL-017 | Database restore | DB corruption | Restore from backup | System operational with latest data | High |
|
| 679 |
+
| NFT-REL-018 | Disaster recovery | Complete site failure | Failover to DR site | Service restored within RTO (4 hours) | Critical |
|
| 680 |
+
| NFT-REL-019 | Job queue recovery | Redis crash | Redis restart with persistence | Queued jobs not lost | High |
|
| 681 |
+
| NFT-REL-020 | Config recovery | Bad config deployment | Deploy bad config | Rollback to previous config | Medium |
|
| 682 |
+
|
| 683 |
+
---
|
| 684 |
+
|
| 685 |
+
### B.4 Scalability Testing
|
| 686 |
+
|
| 687 |
+
Purpose: Verify system can handle growth in load and data.
|
| 688 |
+
|
| 689 |
+
#### **B.4.1 Capacity Testing**
|
| 690 |
+
|
| 691 |
+
| Test Case ID | Test Name | Capacity Metric | Test Scenario | Target Capacity | Priority |
|
| 692 |
+
|--------------|-----------|-----------------|---------------|-----------------|----------|
|
| 693 |
+
| NFT-SCAL-001 | Max concurrent users | User capacity | Gradually increase users | Support 100+ concurrent users | High |
|
| 694 |
+
| NFT-SCAL-002 | Max documents per hour | Throughput | Generate continuously | Process 500+ docs/hour | High |
|
| 695 |
+
| NFT-SCAL-003 | Max queue depth | Job queue | Enqueue 10,000 jobs | Queue handles all jobs | Medium |
|
| 696 |
+
| NFT-SCAL-004 | Max dataset size | Storage | Generate large dataset | Handle 1TB+ datasets | Low |
|
| 697 |
+
| NFT-SCAL-005 | Max file size | Upload limit | Upload large seed image | Accept up to 10MB images | Medium |
|
| 698 |
+
|
| 699 |
+
#### **B.4.2 Elasticity Testing**
|
| 700 |
+
|
| 701 |
+
| Test Case ID | Test Name | Scaling Behavior | Test Scenario | Expected Outcome | Priority |
|
| 702 |
+
|--------------|-----------|------------------|---------------|------------------|----------|
|
| 703 |
+
| NFT-SCAL-006 | Scale-up | Add resources | Increase from 2→10 workers | Linear throughput increase | High |
|
| 704 |
+
| NFT-SCAL-007 | Scale-down | Remove resources | Decrease from 10→2 workers | Graceful job completion | High |
|
| 705 |
+
| NFT-SCAL-008 | Auto-scale up | Load increase | Load triggers scale-up | New instances launched | Medium |
|
| 706 |
+
| NFT-SCAL-009 | Auto-scale down | Load decrease | Low load triggers scale-down | Excess instances terminated | Medium |
|
| 707 |
+
| NFT-SCAL-010 | Burst scaling | Sudden spike | 0→100 requests instantly | Scale-up handles burst | High |
|
| 708 |
+
|
| 709 |
+
---
|
| 710 |
+
|
| 711 |
+
### B.5 Usability Testing
|
| 712 |
+
|
| 713 |
+
Purpose: Verify API ease of use and developer experience.
|
| 714 |
+
|
| 715 |
+
#### **B.5.1 API Documentation Testing**
|
| 716 |
+
|
| 717 |
+
| Test Case ID | Test Name | Documentation Aspect | Test Scenario | Expected Outcome | Priority |
|
| 718 |
+
|--------------|-----------|---------------------|---------------|------------------|----------|
|
| 719 |
+
| NFT-USAB-001 | API docs completeness | All endpoints documented | Review /docs | All endpoints, params documented | High |
|
| 720 |
+
| NFT-USAB-002 | Example accuracy | Code examples | Test all code examples | Examples work without modification | High |
|
| 721 |
+
| NFT-USAB-003 | Error messages clarity | Error documentation | Check error responses | Errors have clear messages, codes | High |
|
| 722 |
+
| NFT-USAB-004 | OpenAPI spec validity | Swagger/OpenAPI | Validate spec | Spec passes OpenAPI validation | Medium |
|
| 723 |
+
| NFT-USAB-005 | Interactive docs | Try-it-out feature | Use /docs to test | Can test endpoints in browser | Medium |
|
| 724 |
+
|
| 725 |
+
#### **B.5.2 Developer Experience Testing**
|
| 726 |
+
|
| 727 |
+
| Test Case ID | Test Name | DX Aspect | Test Scenario | Expected Outcome | Priority |
|
| 728 |
+
|--------------|-----------|-----------|---------------|------------------|----------|
|
| 729 |
+
| NFT-USAB-006 | SDK availability | Client libraries | Check for Python/JS SDKs | SDKs available, documented | Low |
|
| 730 |
+
| NFT-USAB-007 | Quick start guide | Getting started | Follow quick start | Working request in <10 minutes | High |
|
| 731 |
+
| NFT-USAB-008 | API versioning | Version management | Check version headers | Versions clearly indicated | Medium |
|
| 732 |
+
| NFT-USAB-009 | Changelog maintenance | Release notes | Review changelog | All changes documented | Low |
|
| 733 |
+
| NFT-USAB-010 | Deprecation notices | Breaking changes | Check deprecated features | Clear deprecation warnings | Medium |
|
| 734 |
+
|
| 735 |
+
---
|
| 736 |
+
|
| 737 |
+
### B.6 Compatibility Testing
|
| 738 |
+
|
| 739 |
+
Purpose: Verify system works across different environments.
|
| 740 |
+
|
| 741 |
+
#### **B.6.1 Browser Compatibility** (for API docs)
|
| 742 |
+
|
| 743 |
+
| Test Case ID | Browser | Version | Expected Outcome |
|
| 744 |
+
|--------------|---------|---------|------------------|
|
| 745 |
+
| NFT-COMPAT-001 | Chrome | Latest | /docs fully functional |
|
| 746 |
+
| NFT-COMPAT-002 | Firefox | Latest | /docs fully functional |
|
| 747 |
+
| NFT-COMPAT-003 | Safari | Latest | /docs fully functional |
|
| 748 |
+
| NFT-COMPAT-004 | Edge | Latest | /docs fully functional |
|
| 749 |
+
|
| 750 |
+
#### **B.6.2 Platform Compatibility**
|
| 751 |
+
|
| 752 |
+
| Test Case ID | Platform | Test Scenario | Expected Outcome | Priority |
|
| 753 |
+
|--------------|----------|---------------|------------------|----------|
|
| 754 |
+
| NFT-COMPAT-005 | Docker | Deploy in container | Runs without issues | Critical |
|
| 755 |
+
| NFT-COMPAT-006 | Railway | Deploy to Railway | Successful deployment | High |
|
| 756 |
+
| NFT-COMPAT-007 | AWS | Deploy to ECS/Lambda | Runs on AWS | Medium |
|
| 757 |
+
| NFT-COMPAT-008 | GCP | Deploy to Cloud Run | Runs on GCP | Low |
|
| 758 |
+
| NFT-COMPAT-009 | Azure | Deploy to App Service | Runs on Azure | Low |
|
| 759 |
+
|
| 760 |
+
#### **B.6.3 Python Version Compatibility**
|
| 761 |
+
|
| 762 |
+
| Test Case ID | Python Version | Test Scenario | Expected Outcome | Priority |
|
| 763 |
+
|--------------|----------------|---------------|------------------|----------|
|
| 764 |
+
| NFT-COMPAT-010 | Python 3.11 | Run full test suite | All tests pass | Critical |
|
| 765 |
+
| NFT-COMPAT-011 | Python 3.10 | Run full test suite | All tests pass | High |
|
| 766 |
+
| NFT-COMPAT-012 | Python 3.12 | Run full test suite | All tests pass | Medium |
|
| 767 |
+
|
| 768 |
+
---
|
| 769 |
+
|
| 770 |
+
### B.7 Maintainability Testing
|
| 771 |
+
|
| 772 |
+
Purpose: Verify system is easy to maintain and debug.
|
| 773 |
+
|
| 774 |
+
#### **B.7.1 Logging & Monitoring**
|
| 775 |
+
|
| 776 |
+
| Test Case ID | Test Name | Aspect | Test Scenario | Expected Outcome | Priority |
|
| 777 |
+
|--------------|-----------|--------|---------------|------------------|----------|
|
| 778 |
+
| NFT-MAINT-001 | Log completeness | Logging | Check logs during generation | All stages logged | High |
|
| 779 |
+
| NFT-MAINT-002 | Log levels | Log filtering | Filter by ERROR, INFO, DEBUG | Correct levels used | Medium |
|
| 780 |
+
| NFT-MAINT-003 | Structured logging | Log format | Parse log entries | JSON-formatted, parseable | High |
|
| 781 |
+
| NFT-MAINT-004 | Error traceability | Error tracking | Trace error through logs | Request ID tracks full flow | High |
|
| 782 |
+
| NFT-MAINT-005 | Metrics collection | Monitoring | Check Prometheus metrics | Key metrics exported | High |
|
| 783 |
+
| NFT-MAINT-006 | Health checks | Monitoring | Call /health endpoint | Returns detailed status | Critical |
|
| 784 |
+
| NFT-MAINT-007 | Alert configuration | Alerting | Trigger alert condition | Alert fired, notification sent | Medium |
|
| 785 |
+
| NFT-MAINT-008 | Dashboard usability | Visualization | View Grafana dashboards | Clear, actionable insights | Medium |
|
| 786 |
+
|
| 787 |
+
#### **B.7.2 Code Quality**
|
| 788 |
+
|
| 789 |
+
| Test Case ID | Test Name | Quality Metric | Tool | Acceptance Criteria | Priority |
|
| 790 |
+
|--------------|-----------|----------------|------|---------------------|----------|
|
| 791 |
+
| NFT-MAINT-009 | Code coverage | Test coverage | pytest-cov | >80% coverage | High |
|
| 792 |
+
| NFT-MAINT-010 | Code complexity | Cyclomatic complexity | radon | CC <10 per function | Medium |
|
| 793 |
+
| NFT-MAINT-011 | Code duplication | DRY principle | pylint | <5% duplicated code | Low |
|
| 794 |
+
| NFT-MAINT-012 | Code style | PEP 8 compliance | flake8 | No style violations | Medium |
|
| 795 |
+
| NFT-MAINT-013 | Type hints | Type coverage | mypy | >90% type hints | Medium |
|
| 796 |
+
| NFT-MAINT-014 | Security linting | Vulnerability scan | bandit | No high-severity issues | High |
|
| 797 |
+
|
| 798 |
+
---
|
| 799 |
+
|
| 800 |
+
## Test Environment Setup
|
| 801 |
+
|
| 802 |
+
### Test Environments
|
| 803 |
+
|
| 804 |
+
| Environment | Purpose | Configuration | Access |
|
| 805 |
+
|-------------|---------|---------------|--------|
|
| 806 |
+
| **Local Dev** | Development testing | Local Docker Compose | Developers |
|
| 807 |
+
| **CI/CD** | Automated testing | GitHub Actions runners | Automated |
|
| 808 |
+
| **Staging** | Pre-production testing | Mirrors production | QA team |
|
| 809 |
+
| **Production** | Live system | Full infrastructure | Ops team |
|
| 810 |
+
|
| 811 |
+
### Test Data Management
|
| 812 |
+
|
| 813 |
+
**Seed Image Dataset**:
|
| 814 |
+
- **Source**: Curated test set of 50 diverse seed images
|
| 815 |
+
- **Location**: `tests/fixtures/seed_images/`
|
| 816 |
+
- **Categories**: Invoice samples, receipt samples, form samples, letter samples
|
| 817 |
+
- **Licensing**: Public domain or test-licensed images
|
| 818 |
+
|
| 819 |
+
**Test Parameters**:
|
| 820 |
+
```yaml
|
| 821 |
+
# tests/fixtures/test_params.yaml
|
| 822 |
+
test_cases:
|
| 823 |
+
minimal:
|
| 824 |
+
language: "english"
|
| 825 |
+
doc_type: "invoice"
|
| 826 |
+
num_solutions: 1
|
| 827 |
+
enable_handwriting: false
|
| 828 |
+
enable_visual_elements: false
|
| 829 |
+
|
| 830 |
+
full_features:
|
| 831 |
+
language: "english"
|
| 832 |
+
doc_type: "medical_form"
|
| 833 |
+
num_solutions: 2
|
| 834 |
+
enable_handwriting: true
|
| 835 |
+
handwriting_ratio: 0.3
|
| 836 |
+
enable_visual_elements: true
|
| 837 |
+
visual_element_types: ["logo", "signature", "barcode"]
|
| 838 |
+
enable_ocr: true
|
| 839 |
+
enable_dataset_export: true
|
| 840 |
+
```
|
| 841 |
+
|
| 842 |
+
**Mock Services**:
|
| 843 |
+
- **Mock Claude API**: Returns predefined HTML responses for testing
|
| 844 |
+
- **Mock RunPod API**: Returns test handwriting images, simulates delays
|
| 845 |
+
- **Mock Supabase**: In-memory database for testing
|
| 846 |
+
|
| 847 |
+
---
|
| 848 |
+
|
| 849 |
+
## Testing Tools & Frameworks
|
| 850 |
+
|
| 851 |
+
### Test Frameworks
|
| 852 |
+
|
| 853 |
+
| Tool | Purpose | Usage |
|
| 854 |
+
|------|---------|-------|
|
| 855 |
+
| **pytest** | Unit & integration testing | `pytest tests/` |
|
| 856 |
+
| **pytest-asyncio** | Async test support | Async function testing |
|
| 857 |
+
| **pytest-cov** | Code coverage | `pytest --cov=api` |
|
| 858 |
+
| **httpx** | HTTP client testing | API request mocking |
|
| 859 |
+
| **respx** | HTTP mock library | Mock external APIs |
|
| 860 |
+
| **pytest-mock** | Mocking framework | Mock functions, classes |
|
| 861 |
+
| **Faker** | Test data generation | Generate realistic data |
|
| 862 |
+
|
| 863 |
+
### Load Testing Tools
|
| 864 |
+
|
| 865 |
+
| Tool | Purpose | Usage |
|
| 866 |
+
|------|---------|-------|
|
| 867 |
+
| **Locust** | Load & stress testing | `locust -f locustfile.py` |
|
| 868 |
+
| **Apache JMeter** | Performance testing | GUI-based test scenarios |
|
| 869 |
+
| **k6** | Cloud-native load testing | Scripted load tests |
|
| 870 |
+
|
| 871 |
+
### Security Testing Tools
|
| 872 |
+
|
| 873 |
+
| Tool | Purpose | Usage |
|
| 874 |
+
|------|---------|-------|
|
| 875 |
+
| **OWASP ZAP** | Security scanning | Automated vulnerability scan |
|
| 876 |
+
| **Burp Suite** | Penetration testing | Manual security testing |
|
| 877 |
+
| **pip-audit** | Dependency scanning | `pip-audit -r requirements.txt` |
|
| 878 |
+
| **Bandit** | Code security linting | `bandit -r api/` |
|
| 879 |
+
| **Trivy** | Container scanning | `trivy image docgenie-api:latest` |
|
| 880 |
+
|
| 881 |
+
### Monitoring & Observability
|
| 882 |
+
|
| 883 |
+
| Tool | Purpose | Usage |
|
| 884 |
+
|------|---------|-------|
|
| 885 |
+
| **Prometheus** | Metrics collection | Scrape /metrics endpoint |
|
| 886 |
+
| **Grafana** | Metrics visualization | Dashboard creation |
|
| 887 |
+
| **ELK Stack** | Log aggregation | Centralized logging |
|
| 888 |
+
| **Sentry** | Error tracking | Automatic error reporting |
|
| 889 |
+
|
| 890 |
+
---
|
| 891 |
+
|
| 892 |
+
## Test Execution Plan
|
| 893 |
+
|
| 894 |
+
### Phase 1: Unit Testing (Week 1-2)
|
| 895 |
+
**Objective**: Achieve 80%+ code coverage
|
| 896 |
+
|
| 897 |
+
**Tasks**:
|
| 898 |
+
1. Write unit tests for all utility functions (`api/utils.py`)
|
| 899 |
+
2. Test all pipeline stages individually (Stages 01-19)
|
| 900 |
+
3. Mock external dependencies (Claude API, RunPod, Supabase)
|
| 901 |
+
4. Achieve minimum 80% code coverage
|
| 902 |
+
5. Set up CI/CD pipeline for automated testing
|
| 903 |
+
|
| 904 |
+
**Deliverables**:
|
| 905 |
+
- 120+ unit test cases passing
|
| 906 |
+
- Coverage report >80%
|
| 907 |
+
- CI/CD pipeline configured
|
| 908 |
+
|
| 909 |
+
### Phase 2: Integration Testing (Week 3)
|
| 910 |
+
**Objective**: Verify component interactions
|
| 911 |
+
|
| 912 |
+
**Tasks**:
|
| 913 |
+
1. Test pipeline stage integrations (01-03, 03-05, 07-09, etc.)
|
| 914 |
+
2. Test external service integrations (Claude, RunPod, Supabase)
|
| 915 |
+
3. Test database operations (CRUD, transactions)
|
| 916 |
+
4. Test API endpoint workflows
|
| 917 |
+
5. Test background worker integration
|
| 918 |
+
|
| 919 |
+
**Deliverables**:
|
| 920 |
+
- 50+ integration test cases passing
|
| 921 |
+
- All critical workflows tested
|
| 922 |
+
- Service mocks validated
|
| 923 |
+
|
| 924 |
+
### Phase 3: System Testing (Week 4)
|
| 925 |
+
**Objective**: End-to-end workflow validation
|
| 926 |
+
|
| 927 |
+
**Tasks**:
|
| 928 |
+
1. Test complete generation workflows (minimal, full features)
|
| 929 |
+
2. Test error handling scenarios
|
| 930 |
+
3. Test async processing workflows
|
| 931 |
+
4. Test data quality and accuracy
|
| 932 |
+
5. Test performance benchmarks
|
| 933 |
+
|
| 934 |
+
**Deliverables**:
|
| 935 |
+
- 50+ system test cases passing
|
| 936 |
+
- All user journeys tested
|
| 937 |
+
- Performance baselines established
|
| 938 |
+
|
| 939 |
+
### Phase 4: Non-Functional Testing (Week 5-6)
|
| 940 |
+
**Objective**: Verify performance, security, reliability
|
| 941 |
+
|
| 942 |
+
**Tasks**:
|
| 943 |
+
1. **Performance**: Load, stress, endurance, scalability tests
|
| 944 |
+
2. **Security**: Penetration testing, vulnerability scanning
|
| 945 |
+
3. **Reliability**: Fault tolerance, recovery testing
|
| 946 |
+
4. **Usability**: Documentation review, DX testing
|
| 947 |
+
|
| 948 |
+
**Deliverables**:
|
| 949 |
+
- Load test report (normal, peak, sustained)
|
| 950 |
+
- Security audit report
|
| 951 |
+
- Reliability test report
|
| 952 |
+
- Performance benchmarks
|
| 953 |
+
|
| 954 |
+
### Phase 5: Regression Testing (Ongoing)
|
| 955 |
+
**Objective**: Prevent defect reintroduction
|
| 956 |
+
|
| 957 |
+
**Tasks**:
|
| 958 |
+
1. Run full test suite on every commit (CI/CD)
|
| 959 |
+
2. Add tests for every bug fix
|
| 960 |
+
3. Update tests for new features
|
| 961 |
+
4. Maintain >80% code coverage
|
| 962 |
+
|
| 963 |
+
**Frequency**: Continuous (automated on every PR/commit)
|
| 964 |
+
|
| 965 |
+
---
|
| 966 |
+
|
| 967 |
+
## Success Criteria & Metrics
|
| 968 |
+
|
| 969 |
+
### Test Completion Criteria
|
| 970 |
+
|
| 971 |
+
| Criteria | Target | Critical |
|
| 972 |
+
|----------|--------|----------|
|
| 973 |
+
| Unit test coverage | >80% | Yes |
|
| 974 |
+
| Integration tests passing | 100% | Yes |
|
| 975 |
+
| System tests passing | 100% | Yes |
|
| 976 |
+
| Load test: Normal load | 0% errors | Yes |
|
| 977 |
+
| Load test: Peak load | <5% errors | Yes |
|
| 978 |
+
| Security: Critical vulnerabilities | 0 | Yes |
|
| 979 |
+
| Security: High vulnerabilities | <5 | Yes |
|
| 980 |
+
| Performance: Basic generation | <60s | Yes |
|
| 981 |
+
| Performance: Handwriting generation | <300s | Yes |
|
| 982 |
+
| Uptime SLA | >99.5% | No |
|
| 983 |
+
|
| 984 |
+
### Quality Metrics
|
| 985 |
+
|
| 986 |
+
**Code Quality**:
|
| 987 |
+
- Code coverage: >80%
|
| 988 |
+
- Cyclomatic complexity: <10
|
| 989 |
+
- Code duplication: <5%
|
| 990 |
+
- Type hint coverage: >90%
|
| 991 |
+
|
| 992 |
+
**Performance**:
|
| 993 |
+
- API response time (P95): <500ms
|
| 994 |
+
- Document generation (minimal): <60s
|
| 995 |
+
- Document generation (with handwriting): <300s
|
| 996 |
+
- Throughput: >500 docs/hour
|
| 997 |
+
|
| 998 |
+
**Reliability**:
|
| 999 |
+
- Uptime: >99.5%
|
| 1000 |
+
- MTBF (Mean Time Between Failures): >720 hours (30 days)
|
| 1001 |
+
- MTTR (Mean Time To Recover): <30 minutes
|
| 1002 |
+
- Error rate: <1%
|
| 1003 |
+
|
| 1004 |
+
**Security**:
|
| 1005 |
+
- Zero critical vulnerabilities
|
| 1006 |
+
- <5 high-severity vulnerabilities
|
| 1007 |
+
- Dependency update cadence: <30 days behind
|
| 1008 |
+
|
| 1009 |
+
---
|
| 1010 |
+
|
| 1011 |
+
## Risk Assessment
|
| 1012 |
+
|
| 1013 |
+
### High-Risk Areas
|
| 1014 |
+
|
| 1015 |
+
| Component | Risk Level | Mitigation Strategy | Priority |
|
| 1016 |
+
|-----------|------------|---------------------|----------|
|
| 1017 |
+
| Claude API integration | **HIGH** | Retry logic, fallback prompts, rate limiting | Critical |
|
| 1018 |
+
| RunPod handwriting service | **HIGH** | Timeout handling, batch optimization, explicit error handling | Critical |
|
| 1019 |
+
| PDF rendering (Playwright) | **MEDIUM** | Headless browser stability, resource limits | High |
|
| 1020 |
+
| OCR accuracy | **MEDIUM** | Multiple OCR engine options, confidence thresholds | High |
|
| 1021 |
+
| Async job processing | **MEDIUM** | Worker health checks, job retry mechanisms | High |
|
| 1022 |
+
| Database transactions | **MEDIUM** | ACID compliance, connection pooling | High |
|
| 1023 |
+
| File storage | **LOW** | Disk space monitoring, cleanup policies | Medium |
|
| 1024 |
+
|
| 1025 |
+
### Test Risk Mitigation
|
| 1026 |
+
|
| 1027 |
+
| Risk | Impact | Probability | Mitigation |
|
| 1028 |
+
|------|--------|-------------|------------|
|
| 1029 |
+
| External API unavailable during tests | High | Medium | Use mocks, record/replay mode |
|
| 1030 |
+
| Test data corruption | Medium | Low | Version control test fixtures |
|
| 1031 |
+
| Test environment instability | High | Medium | Docker isolation, reproducible builds |
|
| 1032 |
+
| Long test execution time | Low | High | Parallel execution, selective testing |
|
| 1033 |
+
| Flaky tests | Medium | Medium | Retry logic, better assertions |
|
| 1034 |
+
|
| 1035 |
+
---
|
| 1036 |
+
|
| 1037 |
+
## Test Reporting
|
| 1038 |
+
|
| 1039 |
+
### Test Reports
|
| 1040 |
+
|
| 1041 |
+
**Daily Reports** (Automated):
|
| 1042 |
+
- Test execution summary (pass/fail counts)
|
| 1043 |
+
- Code coverage trends
|
| 1044 |
+
- Failed test details
|
| 1045 |
+
- Performance benchmark comparison
|
| 1046 |
+
|
| 1047 |
+
**Weekly Reports** (Manual):
|
| 1048 |
+
- Test progress against plan
|
| 1049 |
+
- New defects discovered
|
| 1050 |
+
- Defect resolution rate
|
| 1051 |
+
- Risk updates
|
| 1052 |
+
|
| 1053 |
+
**Release Reports** (Per Release):
|
| 1054 |
+
- Complete test execution summary
|
| 1055 |
+
- All test case results
|
| 1056 |
+
- Performance test results
|
| 1057 |
+
- Security scan results
|
| 1058 |
+
- Known issues and limitations
|
| 1059 |
+
|
| 1060 |
+
### Defect Tracking
|
| 1061 |
+
|
| 1062 |
+
**Defect Workflow**:
|
| 1063 |
+
1. **Report**: Tester creates defect in issue tracker
|
| 1064 |
+
2. **Triage**: Team prioritizes defect (P0-Critical, P1-High, P2-Medium, P3-Low)
|
| 1065 |
+
3. **Assign**: Developer assigned to fix
|
| 1066 |
+
4. **Fix**: Developer implements fix
|
| 1067 |
+
5. **Verify**: Tester verifies fix
|
| 1068 |
+
6. **Close**: Defect closed, regression test added
|
| 1069 |
+
|
| 1070 |
+
**Defect Metrics**:
|
| 1071 |
+
- Defect discovery rate
|
| 1072 |
+
- Defect resolution rate
|
| 1073 |
+
- Defect escape rate (to production)
|
| 1074 |
+
- Mean time to recover (MTTR)
|
| 1075 |
+
|
| 1076 |
+
---
|
| 1077 |
+
|
| 1078 |
+
## Continuous Improvement
|
| 1079 |
+
|
| 1080 |
+
### Test Optimization
|
| 1081 |
+
|
| 1082 |
+
**Quarterly Reviews**:
|
| 1083 |
+
- Review test coverage (identify gaps)
|
| 1084 |
+
- Remove obsolete tests
|
| 1085 |
+
- Update test data
|
| 1086 |
+
- Optimize test execution time
|
| 1087 |
+
- Review test environment stability
|
| 1088 |
+
|
| 1089 |
+
**Automation Goals**:
|
| 1090 |
+
- Automate 100% of unit tests
|
| 1091 |
+
- Automate 90% of integration tests
|
| 1092 |
+
- Automate 70% of system tests
|
| 1093 |
+
- Automate 50% of non-functional tests
|
| 1094 |
+
|
| 1095 |
+
---
|
| 1096 |
+
|
| 1097 |
+
## Appendix
|
| 1098 |
+
|
| 1099 |
+
### Test Case Template
|
| 1100 |
+
|
| 1101 |
+
```markdown
|
| 1102 |
+
## Test Case ID: [ID]
|
| 1103 |
+
|
| 1104 |
+
**Test Name**: [Descriptive name]
|
| 1105 |
+
|
| 1106 |
+
**Component**: [Module/Component under test]
|
| 1107 |
+
|
| 1108 |
+
**Test Type**: [Unit/Integration/System/Non-Functional]
|
| 1109 |
+
|
| 1110 |
+
**Priority**: [Critical/High/Medium/Low]
|
| 1111 |
+
|
| 1112 |
+
**Prerequisites**:
|
| 1113 |
+
- [List any setup required]
|
| 1114 |
+
|
| 1115 |
+
**Test Steps**:
|
| 1116 |
+
1. [Step 1]
|
| 1117 |
+
2. [Step 2]
|
| 1118 |
+
3. [Step 3]
|
| 1119 |
+
|
| 1120 |
+
**Test Data**:
|
| 1121 |
+
- [Input data required]
|
| 1122 |
+
|
| 1123 |
+
**Expected Result**:
|
| 1124 |
+
- [What should happen]
|
| 1125 |
+
|
| 1126 |
+
**Actual Result**:
|
| 1127 |
+
- [What actually happened - filled during execution]
|
| 1128 |
+
|
| 1129 |
+
**Status**: [Pass/Fail/Blocked/Not Run]
|
| 1130 |
+
|
| 1131 |
+
**Notes**:
|
| 1132 |
+
- [Any additional observations]
|
| 1133 |
+
```
|
| 1134 |
+
|
| 1135 |
+
### Glossary
|
| 1136 |
+
|
| 1137 |
+
- **API**: Application Programming Interface
|
| 1138 |
+
- **CI/CD**: Continuous Integration/Continuous Deployment
|
| 1139 |
+
- **DPI**: Dots Per Inch
|
| 1140 |
+
- **GT**: Ground Truth
|
| 1141 |
+
- **HW**: Handwriting
|
| 1142 |
+
- **KIE**: Key Information Extraction
|
| 1143 |
+
- **LLM**: Large Language Model
|
| 1144 |
+
- **MTBF**: Mean Time Between Failures
|
| 1145 |
+
- **MTTR**: Mean Time To Recover
|
| 1146 |
+
- **OCR**: Optical Character Recognition
|
| 1147 |
+
- **P95**: 95th Percentile
|
| 1148 |
+
- **SLA**: Service Level Agreement
|
| 1149 |
+
- **VE**: Visual Element
|
| 1150 |
+
|
| 1151 |
+
---
|
| 1152 |
+
|
| 1153 |
+
**Document Control**:
|
| 1154 |
+
- **Author**: DocGenie QA Team
|
| 1155 |
+
- **Reviewers**: Development Team, Product Manager
|
| 1156 |
+
- **Approval**: Project Lead
|
| 1157 |
+
- **Next Review Date**: [3 months from approval]
|
| 1158 |
+
|
| 1159 |
+
---
|
| 1160 |
+
|
| 1161 |
+
**END OF DOCUMENT**
|
api/README.md
ADDED
|
@@ -0,0 +1,1220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DocGenie API
|
| 2 |
+
|
| 3 |
+
FastAPI-based REST API for generating synthetic documents using LLMs. This API is **optimized for ML dataset creation** with comprehensive handwriting and visual element support.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- 🚀 **Simple REST API** - Easy to integrate with any frontend
|
| 8 |
+
- 🖼️ **URL-based seed images** - Provide seed images via URLs
|
| 9 |
+
- 🎨 **Customizable prompts** - Control document type, language, and ground truth format
|
| 10 |
+
- ✍️ **Handwriting Generation** - WordStylist diffusion model with 339 author styles
|
| 11 |
+
- 🎯 **Visual Elements** - Stamps, logos, barcodes, photos, figures
|
| 12 |
+
- 📊 **ML-Ready Datasets** - Individual token images with complete metadata
|
| 13 |
+
- 📄 **Complete output** - Returns PDF, HTML, CSS, and bounding boxes
|
| 14 |
+
- ⚡ **Async processing** - Fast and efficient document generation
|
| 15 |
+
|
| 16 |
+
## ML Dataset Creation
|
| 17 |
+
|
| 18 |
+
The API is **fully equipped for ML training dataset creation** with `output_detail: "dataset"` mode:
|
| 19 |
+
|
| 20 |
+
### ✅ Handwriting Data
|
| 21 |
+
- **Individual token images**: Each handwriting field saved as separate PNG (`hw0.png`, `hw1.png`, ...)
|
| 22 |
+
- **Author style IDs**: 339 unique writer styles (0-338) for style-consistent generation
|
| 23 |
+
- **Text content**: Original text for each handwriting field
|
| 24 |
+
- **Position data**: Precise bounding boxes (x, y, width, height) in mm
|
| 25 |
+
- **Signature detection**: Boolean flag for signature vs regular handwriting
|
| 26 |
+
- **Image dimensions**: Width and height for each generated token
|
| 27 |
+
|
| 28 |
+
### ✅ Visual Element Data
|
| 29 |
+
- **Stamps**: Generated with realistic textures, borders, and rotations
|
| 30 |
+
- Text content preserved
|
| 31 |
+
- Red/green color variants
|
| 32 |
+
- Circle/rectangle shapes
|
| 33 |
+
- **Logos**: Random selection from 6+ logo prefabs
|
| 34 |
+
- **Barcodes**: Code128 format with customizable content
|
| 35 |
+
- **Photos**: Random selection from 5+ photo prefabs
|
| 36 |
+
- **Figures/Charts**: Random selection from 6+ chart/diagram prefabs
|
| 37 |
+
- **Individual images**: Each element saved as separate PNG with transparency
|
| 38 |
+
|
| 39 |
+
### ✅ Dataset Metadata
|
| 40 |
+
- **Token mapping JSON**: Complete mapping with:
|
| 41 |
+
- Token IDs and references
|
| 42 |
+
- Style IDs for handwriting
|
| 43 |
+
- Element types for visual elements
|
| 44 |
+
- Position rectangles
|
| 45 |
+
- Image filenames
|
| 46 |
+
- Content text
|
| 47 |
+
- **Ground truth annotations**: QA pairs, classification labels, NER tags
|
| 48 |
+
- **Bounding boxes**: Word, segment, and layout-level bboxes
|
| 49 |
+
- **Normalized coordinates**: [0,1] scaled for ML frameworks
|
| 50 |
+
- **Msgpack export**: Compatible with datadings library
|
| 51 |
+
|
| 52 |
+
### ✅ Additional ML Features
|
| 53 |
+
- **OCR results**: Word-level bboxes and text for Document AI training
|
| 54 |
+
- **Layout elements**: Document structure annotations
|
| 55 |
+
- **Page dimensions**: Physical measurements (mm) and pixel dimensions
|
| 56 |
+
- **Reproducibility**: Seed-based generation for consistent results
|
| 57 |
+
|
| 58 |
+
## Pipeline Overview
|
| 59 |
+
|
| 60 |
+
The API implements a simplified version of the DocGenie generation pipeline:
|
| 61 |
+
|
| 62 |
+
1. **Download seed images** from URLs
|
| 63 |
+
2. **Convert to base64** for LLM input
|
| 64 |
+
3. **Build custom prompt** with user parameters
|
| 65 |
+
4. **Call Claude API** to generate HTML documents
|
| 66 |
+
5. **Extract HTML/CSS** and ground truth from response
|
| 67 |
+
6. **Render to PDF** using Playwright
|
| 68 |
+
7. **Extract bounding boxes** from PDF
|
| 69 |
+
8. **Return results** as JSON with base64-encoded PDF
|
| 70 |
+
|
| 71 |
+
## Installation
|
| 72 |
+
|
| 73 |
+
### Prerequisites
|
| 74 |
+
|
| 75 |
+
- Python 3.10+
|
| 76 |
+
- DocGenie main package installed
|
| 77 |
+
- Playwright browsers installed
|
| 78 |
+
|
| 79 |
+
### Setup
|
| 80 |
+
|
| 81 |
+
1. Install dependencies (all API dependencies are included in the main project):
|
| 82 |
+
```bash
|
| 83 |
+
# Using uv (recommended)
|
| 84 |
+
uv sync
|
| 85 |
+
|
| 86 |
+
# Or using pip
|
| 87 |
+
pip install -e .
|
| 88 |
+
|
| 89 |
+
# Or install API-specific dependencies
|
| 90 |
+
cd api/
|
| 91 |
+
pip install -r requirements.txt
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
**Note**: For async endpoint support, ensure you have:
|
| 95 |
+
- `redis>=5.0.0` and `rq>=1.15.0` (job queue)
|
| 96 |
+
- `supabase>=2.0.0` (database)
|
| 97 |
+
- `google-api-python-client>=2.100.0` (Google Drive integration)
|
| 98 |
+
|
| 99 |
+
2. Install Playwright browsers:
|
| 100 |
+
```bash
|
| 101 |
+
playwright install chromium
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
3. Install Tesseract OCR (for local OCR support):
|
| 105 |
+
```bash
|
| 106 |
+
# Ubuntu/Debian
|
| 107 |
+
sudo apt-get update && sudo apt-get install tesseract-ocr
|
| 108 |
+
|
| 109 |
+
# macOS
|
| 110 |
+
brew install tesseract
|
| 111 |
+
|
| 112 |
+
# Windows
|
| 113 |
+
# Download installer from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
4. Set your Anthropic API key:
|
| 117 |
+
```bash
|
| 118 |
+
export ANTHROPIC_API_KEY="your-api-key-here"
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
5. Configure OCR in `.env`:
|
| 122 |
+
```bash
|
| 123 |
+
cp .env.example .env
|
| 124 |
+
# Edit .env and set:
|
| 125 |
+
OCR_SERVICE_ENABLED=true
|
| 126 |
+
OCR_USE_LOCAL=true # Use local Tesseract (recommended)
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## Running the API
|
| 130 |
+
|
| 131 |
+
### Development Mode
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
cd api
|
| 135 |
+
python main.py
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
The API will be available at `http://localhost:8000`
|
| 139 |
+
|
| 140 |
+
### Production Mode
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
cd api
|
| 144 |
+
uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
## API Endpoints
|
| 148 |
+
|
| 149 |
+
### Health Check
|
| 150 |
+
|
| 151 |
+
```http
|
| 152 |
+
GET /health
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
**Response:**
|
| 156 |
+
```json
|
| 157 |
+
{
|
| 158 |
+
"status": "healthy",
|
| 159 |
+
"version": "1.0.0"
|
| 160 |
+
}
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
### Generate Documents
|
| 164 |
+
|
| 165 |
+
```http
|
| 166 |
+
POST /generate
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
**Request Body:**
|
| 170 |
+
```json
|
| 171 |
+
{
|
| 172 |
+
"seed_images": [
|
| 173 |
+
"https://example.com/seed1.jpg",
|
| 174 |
+
"https://example.com/seed2.jpg"
|
| 175 |
+
],
|
| 176 |
+
"prompt_params": {
|
| 177 |
+
"language": "English",
|
| 178 |
+
"doc_type": "business and administrative",
|
| 179 |
+
"gt_type": "Multiple questions about each document, with their answers taken **verbatim** from the document.",
|
| 180 |
+
"gt_format": "{\"<Text of question 1>\": \"<Answer to question 1>\", \"<Text of question 2>\": \"<Answer to question 2>\", ...}",
|
| 181 |
+
"num_solutions": 3
|
| 182 |
+
},
|
| 183 |
+
"model": "claude-sonnet-4-5-20250929",
|
| 184 |
+
"api_key": "optional-api-key"
|
| 185 |
+
}
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
**Response:**
|
| 189 |
+
```json
|
| 190 |
+
{
|
| 191 |
+
"success": true,
|
| 192 |
+
"message": "Successfully generated 3 documents",
|
| 193 |
+
"total_documents": 3,
|
| 194 |
+
"documents": [
|
| 195 |
+
{
|
| 196 |
+
"document_id": "uuid-123_0",
|
| 197 |
+
"html": "<!DOCTYPE html>...",
|
| 198 |
+
"css": "body { ... }",
|
| 199 |
+
"ground_truth": {
|
| 200 |
+
"What is the invoice number?": "INV-12345",
|
| 201 |
+
"What is the total amount?": "$1,234.56"
|
| 202 |
+
},
|
| 203 |
+
"pdf_base64": "JVBERi0xLjQK...",
|
| 204 |
+
"bboxes": [
|
| 205 |
+
{
|
| 206 |
+
"text": "Invoice",
|
| 207 |
+
"x": 0.1,
|
| 208 |
+
"y": 0.05,
|
| 209 |
+
"width": 0.2,
|
| 210 |
+
"height": 0.03,
|
| 211 |
+
"page": 0
|
| 212 |
+
}
|
| 213 |
+
],
|
| 214 |
+
"page_width_mm": 210.0,
|
| 215 |
+
"page_height_mm": 297.0
|
| 216 |
+
}
|
| 217 |
+
]
|
| 218 |
+
}
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
### Generate Documents (Async) - **Recommended for Production**
|
| 222 |
+
|
| 223 |
+
```http
|
| 224 |
+
POST /generate/async
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
**🎯 Cost Optimization**: This endpoint uses Claude's **Batch API** for **50% cost savings** ($2.50 vs $5.00 per 1M input tokens).
|
| 228 |
+
|
| 229 |
+
**⏱️ Latency**: 5-30 minutes (vs 30-120 seconds for direct API)
|
| 230 |
+
|
| 231 |
+
**✅ Best For**: Multi-user production systems with non-realtime requirements
|
| 232 |
+
|
| 233 |
+
**Request Body:**
|
| 234 |
+
```json
|
| 235 |
+
{
|
| 236 |
+
"user_id": 123,
|
| 237 |
+
"seed_images": [
|
| 238 |
+
"https://example.com/seed1.jpg",
|
| 239 |
+
"https://example.com/seed2.jpg"
|
| 240 |
+
],
|
| 241 |
+
"prompt_params": {
|
| 242 |
+
"language": "English",
|
| 243 |
+
"doc_type": "business and administrative",
|
| 244 |
+
"num_solutions": 3,
|
| 245 |
+
"enable_handwriting": true,
|
| 246 |
+
"enable_visual_elements": true,
|
| 247 |
+
"enable_ocr": true,
|
| 248 |
+
"output_detail": "dataset"
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
**Response:**
|
| 254 |
+
```json
|
| 255 |
+
{
|
| 256 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 257 |
+
"status": "queued",
|
| 258 |
+
"estimated_time_minutes": 10,
|
| 259 |
+
"poll_url": "/jobs/550e8400-e29b-41d4-a716-446655440000/status",
|
| 260 |
+
"created_at": "2025-01-15T12:00:00Z"
|
| 261 |
+
}
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
**Workflow:**
|
| 265 |
+
1. Submit generation request → Get `request_id`
|
| 266 |
+
2. Poll status endpoint every 30-60 seconds
|
| 267 |
+
3. When `status: "completed"`, download from Google Drive
|
| 268 |
+
4. Results uploaded to user's Google Drive with shareable link
|
| 269 |
+
|
| 270 |
+
### Check Job Status
|
| 271 |
+
|
| 272 |
+
```http
|
| 273 |
+
GET /jobs/{request_id}/status
|
| 274 |
+
```
|
| 275 |
+
|
| 276 |
+
**Response (Queued):**
|
| 277 |
+
```json
|
| 278 |
+
{
|
| 279 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 280 |
+
"status": "queued",
|
| 281 |
+
"created_at": "2025-01-15T12:00:00Z",
|
| 282 |
+
"updated_at": "2025-01-15T12:00:00Z"
|
| 283 |
+
}
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
**Response (Processing):**
|
| 287 |
+
```json
|
| 288 |
+
{
|
| 289 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 290 |
+
"status": "processing",
|
| 291 |
+
"created_at": "2025-01-15T12:00:00Z",
|
| 292 |
+
"updated_at": "2025-01-15T12:05:00Z",
|
| 293 |
+
"progress": "Creating batch request..."
|
| 294 |
+
}
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
**Response (Completed):**
|
| 298 |
+
```json
|
| 299 |
+
{
|
| 300 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 301 |
+
"status": "completed",
|
| 302 |
+
"created_at": "2025-01-15T12:00:00Z",
|
| 303 |
+
"updated_at": "2025-01-15T12:15:00Z",
|
| 304 |
+
"download_url": "https://drive.google.com/file/d/abc123xyz/view?usp=sharing",
|
| 305 |
+
"file_size_mb": 15.4,
|
| 306 |
+
"document_count": 3
|
| 307 |
+
}
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
**Response (Failed):**
|
| 311 |
+
```json
|
| 312 |
+
{
|
| 313 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 314 |
+
"status": "failed",
|
| 315 |
+
"created_at": "2025-01-15T12:00:00Z",
|
| 316 |
+
"updated_at": "2025-01-15T12:08:00Z",
|
| 317 |
+
"error_message": "Batch processing timeout"
|
| 318 |
+
}
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
**Status Values:**
|
| 322 |
+
- `queued`: Job submitted, waiting for worker
|
| 323 |
+
- `processing`: Worker picked up job, creating batch
|
| 324 |
+
- `generating`: Batch submitted to Claude, waiting for completion
|
| 325 |
+
- `completed`: Documents generated and uploaded to Google Drive
|
| 326 |
+
- `failed`: Error occurred (see `error_message`)
|
| 327 |
+
|
| 328 |
+
### List User Jobs
|
| 329 |
+
|
| 330 |
+
```http
|
| 331 |
+
GET /jobs/user/{user_id}?limit=50&offset=0
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
**Response:**
|
| 335 |
+
```json
|
| 336 |
+
{
|
| 337 |
+
"user_id": 123,
|
| 338 |
+
"jobs": [
|
| 339 |
+
{
|
| 340 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 341 |
+
"status": "completed",
|
| 342 |
+
"created_at": "2025-01-15T12:00:00Z",
|
| 343 |
+
"download_url": "https://drive.google.com/...",
|
| 344 |
+
"document_count": 3
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"request_id": "660e8400-e29b-41d4-a716-446655440111",
|
| 348 |
+
"status": "processing",
|
| 349 |
+
"created_at": "2025-01-15T12:30:00Z"
|
| 350 |
+
}
|
| 351 |
+
],
|
| 352 |
+
"count": 2,
|
| 353 |
+
"limit": 50,
|
| 354 |
+
"offset": 0
|
| 355 |
+
}
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
## Usage Examples
|
| 359 |
+
|
| 360 |
+
### cURL
|
| 361 |
+
|
| 362 |
+
```bash
|
| 363 |
+
curl -X POST http://localhost:8000/generate \
|
| 364 |
+
-H "Content-Type: application/json" \
|
| 365 |
+
-d '{
|
| 366 |
+
"seed_images": [
|
| 367 |
+
"https://example.com/receipt1.jpg",
|
| 368 |
+
"https://example.com/receipt2.jpg"
|
| 369 |
+
],
|
| 370 |
+
"prompt_params": {
|
| 371 |
+
"language": "English",
|
| 372 |
+
"doc_type": "receipts",
|
| 373 |
+
"num_solutions": 2
|
| 374 |
+
}
|
| 375 |
+
}'
|
| 376 |
+
```
|
| 377 |
+
|
| 378 |
+
### Python (Direct API)
|
| 379 |
+
|
| 380 |
+
```python
|
| 381 |
+
import requests
|
| 382 |
+
import base64
|
| 383 |
+
|
| 384 |
+
response = requests.post(
|
| 385 |
+
"http://localhost:8000/generate",
|
| 386 |
+
json={
|
| 387 |
+
"seed_images": [
|
| 388 |
+
"https://example.com/seed1.jpg",
|
| 389 |
+
"https://example.com/seed2.jpg"
|
| 390 |
+
],
|
| 391 |
+
"prompt_params": {
|
| 392 |
+
"language": "English",
|
| 393 |
+
"doc_type": "business forms",
|
| 394 |
+
"num_solutions": 3
|
| 395 |
+
}
|
| 396 |
+
}
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
result = response.json()
|
| 400 |
+
|
| 401 |
+
# Save first PDF
|
| 402 |
+
if result["success"]:
|
| 403 |
+
pdf_data = base64.b64decode(result["documents"][0]["pdf_base64"])
|
| 404 |
+
with open("generated_doc.pdf", "wb") as f:
|
| 405 |
+
f.write(pdf_data)
|
| 406 |
+
```
|
| 407 |
+
|
| 408 |
+
### Python (Async API with Polling) - **Recommended**
|
| 409 |
+
|
| 410 |
+
```python
|
| 411 |
+
import requests
|
| 412 |
+
import time
|
| 413 |
+
|
| 414 |
+
# Step 1: Submit job
|
| 415 |
+
response = requests.post(
|
| 416 |
+
"http://localhost:8000/generate/async",
|
| 417 |
+
json={
|
| 418 |
+
"user_id": 123,
|
| 419 |
+
"seed_images": [
|
| 420 |
+
"https://example.com/seed1.jpg",
|
| 421 |
+
"https://example.com/seed2.jpg"
|
| 422 |
+
],
|
| 423 |
+
"prompt_params": {
|
| 424 |
+
"language": "English",
|
| 425 |
+
"doc_type": "receipts and invoices",
|
| 426 |
+
"num_solutions": 5,
|
| 427 |
+
"enable_handwriting": True,
|
| 428 |
+
"enable_visual_elements": True,
|
| 429 |
+
"enable_ocr": True,
|
| 430 |
+
"output_detail": "dataset"
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
job = response.json()
|
| 436 |
+
request_id = job["request_id"]
|
| 437 |
+
print(f"✓ Job submitted: {request_id}")
|
| 438 |
+
print(f" Estimated time: {job['estimated_time_minutes']} minutes")
|
| 439 |
+
|
| 440 |
+
# Step 2: Poll status until complete
|
| 441 |
+
while True:
|
| 442 |
+
status_response = requests.get(
|
| 443 |
+
f"http://localhost:8000/jobs/{request_id}/status"
|
| 444 |
+
)
|
| 445 |
+
status = status_response.json()
|
| 446 |
+
|
| 447 |
+
print(f" Status: {status['status']}", end="")
|
| 448 |
+
if status.get("progress"):
|
| 449 |
+
print(f" - {status['progress']}")
|
| 450 |
+
else:
|
| 451 |
+
print()
|
| 452 |
+
|
| 453 |
+
if status["status"] == "completed":
|
| 454 |
+
print(f"✓ Generation complete!")
|
| 455 |
+
print(f" Download: {status['download_url']}")
|
| 456 |
+
print(f" Size: {status.get('file_size_mb', 0):.1f} MB")
|
| 457 |
+
print(f" Documents: {status.get('document_count', 0)}")
|
| 458 |
+
break
|
| 459 |
+
elif status["status"] == "failed":
|
| 460 |
+
print(f"✗ Generation failed: {status.get('error_message')}")
|
| 461 |
+
break
|
| 462 |
+
|
| 463 |
+
# Wait 30 seconds before next poll
|
| 464 |
+
time.sleep(30)
|
| 465 |
+
|
| 466 |
+
# Step 3: Download from Google Drive (if completed)
|
| 467 |
+
if status["status"] == "completed":
|
| 468 |
+
# User can download from their Google Drive using the shareable link
|
| 469 |
+
print(f"\nDownload your documents at:\n{status['download_url']}")
|
| 470 |
+
```
|
| 471 |
+
|
| 472 |
+
### JavaScript
|
| 473 |
+
|
| 474 |
+
```javascript
|
| 475 |
+
const response = await fetch('http://localhost:8000/generate', {
|
| 476 |
+
method: 'POST',
|
| 477 |
+
headers: {
|
| 478 |
+
'Content-Type': 'application/json',
|
| 479 |
+
},
|
| 480 |
+
body: JSON.stringify({
|
| 481 |
+
seed_images: [
|
| 482 |
+
'https://example.com/seed1.jpg',
|
| 483 |
+
'https://example.com/seed2.jpg'
|
| 484 |
+
],
|
| 485 |
+
prompt_params: {
|
| 486 |
+
language: 'English',
|
| 487 |
+
doc_type: 'invoices',
|
| 488 |
+
num_solutions: 2
|
| 489 |
+
}
|
| 490 |
+
})
|
| 491 |
+
});
|
| 492 |
+
|
| 493 |
+
const result = await response.json();
|
| 494 |
+
|
| 495 |
+
// Convert base64 PDF to blob
|
| 496 |
+
const pdfBlob = await fetch(`data:application/pdf;base64,${result.documents[0].pdf_base64}`)
|
| 497 |
+
.then(res => res.blob());
|
| 498 |
+
```
|
| 499 |
+
|
| 500 |
+
## Configuration
|
| 501 |
+
|
| 502 |
+
### Prompt Parameters
|
| 503 |
+
|
| 504 |
+
- **language**: Language for generated documents (default: "English")
|
| 505 |
+
- **doc_type**: Type of documents to generate (e.g., "business and administrative", "receipts", "forms")
|
| 506 |
+
- **gt_type**: Description of ground truth type to generate
|
| 507 |
+
- **gt_format**: Format specification for ground truth JSON
|
| 508 |
+
- **num_solutions**: Number of document variations (1-5)
|
| 509 |
+
|
| 510 |
+
### Stage 3-5 Advanced Features
|
| 511 |
+
|
| 512 |
+
The API supports advanced document synthesis and dataset packaging:
|
| 513 |
+
|
| 514 |
+
#### Stage 3: Handwriting & Visual Elements
|
| 515 |
+
- **enable_handwriting**: Add handwritten text using diffusion model (default: false)
|
| 516 |
+
- **handwriting_ratio**: Percentage of text to convert to handwriting 0-1 (default: 0.5)
|
| 517 |
+
- **enable_visual_elements**: Add stamps, barcodes, logos (default: false)
|
| 518 |
+
- **visual_element_types**: Types of elements to add: ["stamp", "logo", "figure", "barcode", "photo"] (default: all types)
|
| 519 |
+
|
| 520 |
+
#### Stage 4: OCR
|
| 521 |
+
- **enable_ocr**: Perform OCR on generated document (default: false)
|
| 522 |
+
- **ocr_language**: OCR language code (default: "en")
|
| 523 |
+
|
| 524 |
+
#### Stage 5: Dataset Packaging
|
| 525 |
+
- **enable_bbox_normalization**: Normalize bboxes to [0,1] scale (default: false)
|
| 526 |
+
- **enable_gt_verification**: Verify ground truth quality (default: false)
|
| 527 |
+
- **enable_analysis**: Generate dataset statistics (default: false)
|
| 528 |
+
- **enable_debug_visualization**: Create bbox overlay images (default: false)
|
| 529 |
+
|
| 530 |
+
#### Dataset Export (Msgpack Format)
|
| 531 |
+
- **enable_dataset_export**: Export as msgpack dataset format (default: false)
|
| 532 |
+
- **dataset_export_format**: Export format - only "msgpack" is supported (default: "msgpack")
|
| 533 |
+
|
| 534 |
+
**Note**: Only msgpack format is implemented in the current pipeline. COCO and HuggingFace export formats mentioned in some documentation are not yet available.
|
| 535 |
+
|
| 536 |
+
#### Output Detail Level
|
| 537 |
+
- **output_detail**: Controls how much data is returned/saved (default: "minimal")
|
| 538 |
+
- `"minimal"` (default): Final outputs only (PDFs, images, metadata) - 2-5 MB per document
|
| 539 |
+
- `"dataset"`: Includes individual token images for ML training - 10-20 MB per document
|
| 540 |
+
- Individual handwriting token images (`handwriting_tokens/hw0.png`, ...)
|
| 541 |
+
- Individual visual element images (`visual_elements/logo_0.png`, ...)
|
| 542 |
+
- Token mapping JSON with style IDs and positions
|
| 543 |
+
- `"complete"`: All intermediate files and debug info - 20-50 MB per document
|
| 544 |
+
- Everything from `dataset` mode
|
| 545 |
+
- Intermediate PDFs from each processing stage
|
| 546 |
+
- Generation logs
|
| 547 |
+
- ⚠️ **Warning**: Can result in 50+ MB JSON responses for `/generate` endpoint
|
| 548 |
+
|
| 549 |
+
**Recommendation**: Use `"minimal"` for production, `"dataset"` for ML research, `"complete"` for debugging (only with `/generate/pdf`).
|
| 550 |
+
|
| 551 |
+
**Example with dataset output detail:**
|
| 552 |
+
```python
|
| 553 |
+
import requests
|
| 554 |
+
import base64
|
| 555 |
+
import json
|
| 556 |
+
|
| 557 |
+
# Generate ML training dataset
|
| 558 |
+
response = requests.post(
|
| 559 |
+
"http://localhost:8000/generate",
|
| 560 |
+
json={
|
| 561 |
+
"seed_images": ["https://example.com/seed.jpg"],
|
| 562 |
+
"prompt_params": {
|
| 563 |
+
"language": "English",
|
| 564 |
+
"doc_type": "receipts and invoices",
|
| 565 |
+
"num_solutions": 5,
|
| 566 |
+
|
| 567 |
+
# Enable handwriting and visual elements
|
| 568 |
+
"enable_handwriting": True,
|
| 569 |
+
"handwriting_ratio": 0.4,
|
| 570 |
+
"enable_visual_elements": True,
|
| 571 |
+
"visual_element_types": ["stamp", "logo", "figure", "barcode", "photo"], # All types by default
|
| 572 |
+
|
| 573 |
+
# Enable dataset features
|
| 574 |
+
"enable_ocr": True,
|
| 575 |
+
"enable_bbox_normalization": True,
|
| 576 |
+
"enable_dataset_export": True,
|
| 577 |
+
|
| 578 |
+
# IMPORTANT: Set output_detail to "dataset" for ML training
|
| 579 |
+
"output_detail": "dataset",
|
| 580 |
+
|
| 581 |
+
# Use seed for reproducibility
|
| 582 |
+
"seed": 42
|
| 583 |
+
}
|
| 584 |
+
}
|
| 585 |
+
)
|
| 586 |
+
|
| 587 |
+
result = response.json()
|
| 588 |
+
|
| 589 |
+
# Process each generated document
|
| 590 |
+
for doc in result["documents"]:
|
| 591 |
+
doc_id = doc["document_id"]
|
| 592 |
+
print(f"\nProcessing {doc_id}:")
|
| 593 |
+
|
| 594 |
+
# 1. Save individual handwriting token images
|
| 595 |
+
if doc.get("handwriting_token_images"):
|
| 596 |
+
print(f" - Handwriting tokens: {len(doc['handwriting_token_images'])}")
|
| 597 |
+
for hw_id, img_b64 in doc["handwriting_token_images"].items():
|
| 598 |
+
with open(f"dataset/{doc_id}/{hw_id}.png", "wb") as f:
|
| 599 |
+
f.write(base64.b64decode(img_b64))
|
| 600 |
+
|
| 601 |
+
# 2. Save individual visual element images
|
| 602 |
+
if doc.get("visual_element_images"):
|
| 603 |
+
print(f" - Visual elements: {len(doc['visual_element_images'])}")
|
| 604 |
+
for ve_id, img_b64 in doc["visual_element_images"].items():
|
| 605 |
+
with open(f"dataset/{doc_id}/{ve_id}.png", "wb") as f:
|
| 606 |
+
f.write(base64.b64decode(img_b64))
|
| 607 |
+
|
| 608 |
+
# 3. Save token mapping for ML training
|
| 609 |
+
if doc.get("token_mapping"):
|
| 610 |
+
mapping = doc["token_mapping"]
|
| 611 |
+
print(f" - Mapping: {mapping['handwriting']['total_count']} HW + {mapping['visual_elements']['total_count']} VE")
|
| 612 |
+
with open(f"dataset/{doc_id}/token_mapping.json", "w") as f:
|
| 613 |
+
json.dump(mapping, f, indent=2)
|
| 614 |
+
|
| 615 |
+
# 4. Save ground truth annotations
|
| 616 |
+
if doc.get("ground_truth"):
|
| 617 |
+
with open(f"dataset/{doc_id}/ground_truth.json", "w") as f:
|
| 618 |
+
json.dump(doc["ground_truth"], f, indent=2)
|
| 619 |
+
|
| 620 |
+
# 5. Save bounding boxes (normalized coordinates)
|
| 621 |
+
if doc.get("normalized_bboxes_word"):
|
| 622 |
+
with open(f"dataset/{doc_id}/bboxes_normalized.json", "w") as f:
|
| 623 |
+
json.dump(doc["normalized_bboxes_word"], f, indent=2)
|
| 624 |
+
|
| 625 |
+
# 6. Save final document image
|
| 626 |
+
if doc.get("image_base64"):
|
| 627 |
+
with open(f"dataset/{doc_id}/final_image.png", "wb") as f:
|
| 628 |
+
f.write(base64.b64decode(doc["image_base64"]))
|
| 629 |
+
|
| 630 |
+
# 7. Save msgpack dataset file
|
| 631 |
+
if doc.get("dataset_export") and doc["dataset_export"].get("msgpack_base64"):
|
| 632 |
+
with open(f"dataset/{doc_id}/dataset.msgpack", "wb") as f:
|
| 633 |
+
f.write(base64.b64decode(doc["dataset_export"]["msgpack_base64"]))
|
| 634 |
+
|
| 635 |
+
print(f"\n✅ Generated {len(result['documents'])} ML-ready documents")
|
| 636 |
+
```
|
| 637 |
+
|
| 638 |
+
### PDF Generation Endpoint (Recommended for Large Datasets)
|
| 639 |
+
|
| 640 |
+
For bulk generation with comprehensive file outputs, use `/generate/pdf`:
|
| 641 |
+
|
| 642 |
+
```bash
|
| 643 |
+
curl -X POST http://localhost:8000/generate/pdf \
|
| 644 |
+
-H "Content-Type: application/json" \
|
| 645 |
+
-d '{
|
| 646 |
+
"seed_images": ["https://example.com/seed1.jpg"],
|
| 647 |
+
"prompt_params": {
|
| 648 |
+
"num_solutions": 3,
|
| 649 |
+
"enable_handwriting": true,
|
| 650 |
+
"enable_ocr": true,
|
| 651 |
+
"enable_bbox_normalization": true,
|
| 652 |
+
"enable_dataset_export": true,
|
| 653 |
+
"output_detail": "dataset"
|
| 654 |
+
}
|
| 655 |
+
}' \
|
| 656 |
+
--output documents.zip
|
| 657 |
+
```
|
| 658 |
+
|
| 659 |
+
#### ZIP File Contents
|
| 660 |
+
|
| 661 |
+
Based on `output_detail` level:
|
| 662 |
+
|
| 663 |
+
**Minimal (default):**
|
| 664 |
+
- `document_<id>.pdf` - Generated PDF files
|
| 665 |
+
- `document_<id>/` - Per-document directories with:
|
| 666 |
+
- `document.html`, `document.css` - Source files
|
| 667 |
+
- `ground_truth.json`, `bboxes.json` - Annotations
|
| 668 |
+
- `final_image.png` - Final rendered image (if Stage 3 enabled)
|
| 669 |
+
- `handwriting_regions.json`, `visual_elements.json` - Stage 3 metadata (if enabled)
|
| 670 |
+
- `ocr_results.json` - OCR word-level data (if OCR enabled)
|
| 671 |
+
- `README.md` - Package documentation
|
| 672 |
+
- `metadata.json` - Combined metadata
|
| 673 |
+
|
| 674 |
+
**Dataset (for ML training):**
|
| 675 |
+
- All files from "minimal" level, plus:
|
| 676 |
+
- `handwriting_tokens/` - Individual token images (`hw0.png`, `hw1.png`, ...)
|
| 677 |
+
- `visual_elements/` - Individual element images (`logo_0.png`, `stamp_1.png`, ...)
|
| 678 |
+
- `token_mapping.json` - Complete mapping with style IDs and positions
|
| 679 |
+
- `dataset.msgpack` - Msgpack dataset file (if export enabled)
|
| 680 |
+
- `normalized_bboxes_word.json` - Normalized coordinates (if Stage 5 enabled)
|
| 681 |
+
|
| 682 |
+
**Complete (for debugging):**
|
| 683 |
+
- All files from "dataset" level, plus:
|
| 684 |
+
- Intermediate PDFs from each processing stage
|
| 685 |
+
- Generation logs with timing information
|
| 686 |
+
- `debug_visualization.png` - Bbox overlay images
|
| 687 |
+
|
| 688 |
+
### Supported Models
|
| 689 |
+
|
| 690 |
+
- `claude-sonnet-4-5-20250929` (default, recommended)
|
| 691 |
+
- `claude-3-5-sonnet-20241022`
|
| 692 |
+
|
| 693 |
+
### Environment Variables
|
| 694 |
+
|
| 695 |
+
- `ANTHROPIC_API_KEY`: Your Anthropic API key (required if not provided in request)
|
| 696 |
+
|
| 697 |
+
## API Documentation
|
| 698 |
+
|
| 699 |
+
Interactive API documentation is available when the server is running:
|
| 700 |
+
|
| 701 |
+
- **Swagger UI**: http://localhost:8000/docs
|
| 702 |
+
- **ReDoc**: http://localhost:8000/redoc
|
| 703 |
+
|
| 704 |
+
## Error Handling
|
| 705 |
+
|
| 706 |
+
The API returns appropriate HTTP status codes:
|
| 707 |
+
|
| 708 |
+
- `200 OK`: Successful generation
|
| 709 |
+
- `400 Bad Request`: Invalid input (e.g., invalid image URLs)
|
| 710 |
+
- `401 Unauthorized`: Missing or invalid API key
|
| 711 |
+
- `500 Internal Server Error`: Processing error
|
| 712 |
+
|
| 713 |
+
Error response format:
|
| 714 |
+
```json
|
| 715 |
+
{
|
| 716 |
+
"detail": "Error message describing what went wrong"
|
| 717 |
+
}
|
| 718 |
+
```
|
| 719 |
+
|
| 720 |
+
## Performance Considerations
|
| 721 |
+
|
| 722 |
+
- **Concurrent requests**: The API can handle multiple requests concurrently
|
| 723 |
+
- **Image size**: Larger seed images take longer to process
|
| 724 |
+
- **Number of solutions**: More solutions = longer processing time
|
| 725 |
+
- **Model selection**: Sonnet is slower but higher quality than Haiku
|
| 726 |
+
|
| 727 |
+
## Limitations
|
| 728 |
+
|
| 729 |
+
- Maximum 10 seed images per request
|
| 730 |
+
- Maximum 5 document variations (`num_solutions`)
|
| 731 |
+
- Single-page documents only
|
| 732 |
+
- Timeout: 60 seconds per PDF render
|
| 733 |
+
|
| 734 |
+
## Troubleshooting
|
| 735 |
+
|
| 736 |
+
### Playwright browser not found
|
| 737 |
+
|
| 738 |
+
```bash
|
| 739 |
+
playwright install chromium
|
| 740 |
+
```
|
| 741 |
+
|
| 742 |
+
### API key not working
|
| 743 |
+
|
| 744 |
+
Make sure your API key is set correctly:
|
| 745 |
+
```bash
|
| 746 |
+
echo $ANTHROPIC_API_KEY
|
| 747 |
+
```
|
| 748 |
+
|
| 749 |
+
### PDF rendering fails
|
| 750 |
+
|
| 751 |
+
Ensure Chromium is installed and accessible:
|
| 752 |
+
```bash
|
| 753 |
+
playwright install --dry-run  # lists browsers and whether they are installed
|
| 754 |
+
```
|
| 755 |
+
|
| 756 |
+
## Integration with Frontend
|
| 757 |
+
|
| 758 |
+
Example React integration:
|
| 759 |
+
|
| 760 |
+
```jsx
|
| 761 |
+
const [loading, setLoading] = useState(false);
|
| 762 |
+
const [result, setResult] = useState(null);
|
| 763 |
+
|
| 764 |
+
const generateDocuments = async () => {
|
| 765 |
+
setLoading(true);
|
| 766 |
+
|
| 767 |
+
try {
|
| 768 |
+
const response = await fetch('http://localhost:8000/generate', {
|
| 769 |
+
method: 'POST',
|
| 770 |
+
headers: { 'Content-Type': 'application/json' },
|
| 771 |
+
body: JSON.stringify({
|
| 772 |
+
seed_images: seedImageUrls,
|
| 773 |
+
prompt_params: {
|
| 774 |
+
language: 'English',
|
| 775 |
+
doc_type: documentType,
|
| 776 |
+
num_solutions: 3
|
| 777 |
+
}
|
| 778 |
+
})
|
| 779 |
+
});
|
| 780 |
+
|
| 781 |
+
const data = await response.json();
|
| 782 |
+
setResult(data);
|
| 783 |
+
} catch (error) {
|
| 784 |
+
console.error('Generation failed:', error);
|
| 785 |
+
} finally {
|
| 786 |
+
setLoading(false);
|
| 787 |
+
}
|
| 788 |
+
};
|
| 789 |
+
```
|
| 790 |
+
|
| 791 |
+
### React Integration (Async API with Progress)
|
| 792 |
+
|
| 793 |
+
```jsx
|
| 794 |
+
import { useState, useEffect } from 'react';
|
| 795 |
+
|
| 796 |
+
function DocumentGenerator({ userId, seedImages }) {
|
| 797 |
+
const [requestId, setRequestId] = useState(null);
|
| 798 |
+
const [status, setStatus] = useState(null);
|
| 799 |
+
const [progress, setProgress] = useState(0);
|
| 800 |
+
|
| 801 |
+
// Submit job
|
| 802 |
+
const handleGenerate = async () => {
|
| 803 |
+
const response = await fetch('http://localhost:8000/generate/async', {
|
| 804 |
+
method: 'POST',
|
| 805 |
+
headers: { 'Content-Type': 'application/json' },
|
| 806 |
+
body: JSON.stringify({
|
| 807 |
+
user_id: userId,
|
| 808 |
+
seed_images: seedImages,
|
| 809 |
+
prompt_params: {
|
| 810 |
+
language: 'English',
|
| 811 |
+
doc_type: 'receipts',
|
| 812 |
+
num_solutions: 3,
|
| 813 |
+
enable_handwriting: true,
|
| 814 |
+
output_detail: 'dataset'
|
| 815 |
+
}
|
| 816 |
+
})
|
| 817 |
+
});
|
| 818 |
+
|
| 819 |
+
const job = await response.json();
|
| 820 |
+
setRequestId(job.request_id);
|
| 821 |
+
setStatus('queued');
|
| 822 |
+
};
|
| 823 |
+
|
| 824 |
+
// Poll job status
|
| 825 |
+
useEffect(() => {
|
| 826 |
+
if (!requestId || status === 'completed' || status === 'failed') return;
|
| 827 |
+
|
| 828 |
+
const interval = setInterval(async () => {
|
| 829 |
+
const response = await fetch(`http://localhost:8000/jobs/${requestId}/status`);
|
| 830 |
+
const jobStatus = await response.json();
|
| 831 |
+
|
| 832 |
+
setStatus(jobStatus.status);
|
| 833 |
+
|
| 834 |
+
// Update progress bar
|
| 835 |
+
const progressMap = {
|
| 836 |
+
'queued': 10,
|
| 837 |
+
'processing': 30,
|
| 838 |
+
'generating': 60,
|
| 839 |
+
'completed': 100,
|
| 840 |
+
'failed': 0
|
| 841 |
+
};
|
| 842 |
+
setProgress(progressMap[jobStatus.status] || 0);
|
| 843 |
+
|
| 844 |
+
if (jobStatus.status === 'completed') {
|
| 845 |
+
// Open Google Drive download link
|
| 846 |
+
window.open(jobStatus.download_url, '_blank');
|
| 847 |
+
}
|
| 848 |
+
}, 30000); // Poll every 30 seconds
|
| 849 |
+
|
| 850 |
+
return () => clearInterval(interval);
|
| 851 |
+
}, [requestId, status]);
|
| 852 |
+
|
| 853 |
+
return (
|
| 854 |
+
<div>
|
| 855 |
+
<button onClick={handleGenerate} disabled={!!status && status !== 'completed' && status !== 'failed'}>
|
| 856 |
+
Generate Documents
|
| 857 |
+
</button>
|
| 858 |
+
|
| 859 |
+
{status && (
|
| 860 |
+
<div className="progress-container">
|
| 861 |
+
<div className="progress-bar" style={{ width: `${progress}%` }} />
|
| 862 |
+
<p>Status: {status}</p>
|
| 863 |
+
{status === 'completed' && (
|
| 864 |
+
<a href={`http://localhost:8000/jobs/${requestId}/status`}>
|
| 865 |
+
Download Results
|
| 866 |
+
</a>
|
| 867 |
+
)}
|
| 868 |
+
</div>
|
| 869 |
+
)}
|
| 870 |
+
</div>
|
| 871 |
+
);
|
| 872 |
+
}
|
| 873 |
+
```
|
| 874 |
+
|
| 875 |
+
## Background Processing Setup
|
| 876 |
+
|
| 877 |
+
The async endpoints (`/generate/async`) require a background worker system for job processing.
|
| 878 |
+
|
| 879 |
+
### Prerequisites
|
| 880 |
+
|
| 881 |
+
1. **Redis** - Job queue storage
|
| 882 |
+
2. **Supabase** - Database for job tracking and user data
|
| 883 |
+
3. **Google Drive OAuth** - For uploading results to user's Drive
|
| 884 |
+
|
| 885 |
+
### Installing Redis
|
| 886 |
+
|
| 887 |
+
**Ubuntu/Debian:**
|
| 888 |
+
```bash
|
| 889 |
+
sudo apt-get update
|
| 890 |
+
sudo apt-get install redis-server
|
| 891 |
+
sudo systemctl start redis
|
| 892 |
+
sudo systemctl enable redis
|
| 893 |
+
```
|
| 894 |
+
|
| 895 |
+
**macOS:**
|
| 896 |
+
```bash
|
| 897 |
+
brew install redis
|
| 898 |
+
brew services start redis
|
| 899 |
+
```
|
| 900 |
+
|
| 901 |
+
**Docker:**
|
| 902 |
+
```bash
|
| 903 |
+
docker run -d -p 6379:6379 --name redis redis:7-alpine
|
| 904 |
+
```
|
| 905 |
+
|
| 906 |
+
**Verify Redis is running:**
|
| 907 |
+
```bash
|
| 908 |
+
redis-cli ping
|
| 909 |
+
# Should return: PONG
|
| 910 |
+
```
|
| 911 |
+
|
| 912 |
+
### Configuring Supabase
|
| 913 |
+
|
| 914 |
+
1. Create a Supabase project at [supabase.com](https://supabase.com)
|
| 915 |
+
|
| 916 |
+
2. Create the required tables in your Supabase SQL Editor:
|
| 917 |
+
|
| 918 |
+
```sql
|
| 919 |
+
-- Document generation requests
|
| 920 |
+
-- Required for uuid_generate_v4()
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE document_requests (
|
| 921 |
+
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
| 922 |
+
user_id INTEGER NOT NULL,
|
| 923 |
+
status TEXT NOT NULL CHECK (status IN ('queued', 'processing', 'generating', 'completed', 'failed')),
|
| 924 |
+
request_metadata JSONB NOT NULL,
|
| 925 |
+
error_message TEXT,
|
| 926 |
+
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
| 927 |
+
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
| 928 |
+
);
|
| 929 |
+
|
| 930 |
+
-- Generated documents
|
| 931 |
+
CREATE TABLE generated_documents (
|
| 932 |
+
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
| 933 |
+
request_id UUID NOT NULL REFERENCES document_requests(id),
|
| 934 |
+
document_id TEXT NOT NULL,
|
| 935 |
+
file_url TEXT,
|
| 936 |
+
zip_url TEXT,
|
| 937 |
+
file_size_mb DECIMAL,
|
| 938 |
+
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
| 939 |
+
);
|
| 940 |
+
|
| 941 |
+
-- User integrations (Google Drive OAuth)
|
| 942 |
+
CREATE TABLE user_integrations (
|
| 943 |
+
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
| 944 |
+
user_id INTEGER NOT NULL,
|
| 945 |
+
integration_type TEXT NOT NULL CHECK (integration_type IN ('google_drive', 'dropbox')),
|
| 946 |
+
access_token TEXT NOT NULL,
|
| 947 |
+
refresh_token TEXT,
|
| 948 |
+
token_expiry TIMESTAMPTZ,
|
| 949 |
+
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
| 950 |
+
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
| 951 |
+
UNIQUE(user_id, integration_type)
|
| 952 |
+
);
|
| 953 |
+
|
| 954 |
+
-- Analytics events
|
| 955 |
+
CREATE TABLE analytics_events (
|
| 956 |
+
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
| 957 |
+
user_id INTEGER,
|
| 958 |
+
event_type TEXT NOT NULL,
|
| 959 |
+
entity_id UUID,
|
| 960 |
+
event_data JSONB,
|
| 961 |
+
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
| 962 |
+
);
|
| 963 |
+
|
| 964 |
+
-- Indexes for performance
|
| 965 |
+
CREATE INDEX idx_document_requests_user_id ON document_requests(user_id);
|
| 966 |
+
CREATE INDEX idx_document_requests_status ON document_requests(status);
|
| 967 |
+
CREATE INDEX idx_generated_documents_request_id ON generated_documents(request_id);
|
| 968 |
+
CREATE INDEX idx_user_integrations_user_id ON user_integrations(user_id);
|
| 969 |
+
CREATE INDEX idx_analytics_events_user_id ON analytics_events(user_id);
|
| 970 |
+
```
|
| 971 |
+
|
| 972 |
+
3. Add your Supabase credentials to `.env`:
|
| 973 |
+
|
| 974 |
+
```bash
|
| 975 |
+
# In api/.env
|
| 976 |
+
SUPABASE_URL=https://your-project-ref.supabase.co
|
| 977 |
+
SUPABASE_KEY=your-anon-or-service-role-key
|
| 978 |
+
```
|
| 979 |
+
|
| 980 |
+
### Configuring Google Drive OAuth
|
| 981 |
+
|
| 982 |
+
Users need to connect their Google Drive account for result storage:
|
| 983 |
+
|
| 984 |
+
1. Create a Google Cloud Project at [console.cloud.google.com](https://console.cloud.google.com)
|
| 985 |
+
2. Enable Google Drive API
|
| 986 |
+
3. Create OAuth 2.0 credentials (Web application)
|
| 987 |
+
4. Add authorized redirect URIs (e.g., `http://localhost:3000/auth/google/callback`)
|
| 988 |
+
5. Download credentials JSON
|
| 989 |
+
|
| 990 |
+
6. Users authenticate via OAuth flow (implement in your frontend):
|
| 991 |
+
|
| 992 |
+
```python
|
| 993 |
+
# Example OAuth flow (implement in your auth system)
|
| 994 |
+
from google_auth_oauthlib.flow import Flow
|
| 995 |
+
|
| 996 |
+
flow = Flow.from_client_config(
|
| 997 |
+
client_config={
|
| 998 |
+
"web": {
|
| 999 |
+
"client_id": "YOUR_CLIENT_ID",
|
| 1000 |
+
"client_secret": "YOUR_CLIENT_SECRET",
|
| 1001 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
| 1002 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
| 1003 |
+
"redirect_uris": ["http://localhost:3000/auth/google/callback"]
|
| 1004 |
+
}
|
| 1005 |
+
},
|
| 1006 |
+
scopes=["https://www.googleapis.com/auth/drive.file"]
|
| 1007 |
+
)
|
| 1008 |
+
|
| 1009 |
+
# User visits auth URL, gets redirected back with code
|
| 1010 |
+
authorization_url, state = flow.authorization_url(access_type='offline', include_granted_scopes='true')
|
| 1011 |
+
|
| 1012 |
+
# Exchange code for tokens
|
| 1013 |
+
flow.fetch_token(code=authorization_code)
|
| 1014 |
+
credentials = flow.credentials
|
| 1015 |
+
|
| 1016 |
+
# Store in Supabase user_integrations table
|
| 1017 |
+
supabase.table('user_integrations').insert({
|
| 1018 |
+
'user_id': user_id,
|
| 1019 |
+
'integration_type': 'google_drive',
|
| 1020 |
+
'access_token': credentials.token,
|
| 1021 |
+
'refresh_token': credentials.refresh_token,
|
| 1022 |
+
'token_expiry': credentials.expiry.isoformat() if credentials.expiry else None
|
| 1023 |
+
}).execute()
|
| 1024 |
+
```
|
| 1025 |
+
|
| 1026 |
+
### Starting the Background Worker
|
| 1027 |
+
|
| 1028 |
+
1. Configure environment variables in `api/.env`:
|
| 1029 |
+
|
| 1030 |
+
```bash
|
| 1031 |
+
# Redis Configuration
|
| 1032 |
+
REDIS_URL=redis://localhost:6379/0
|
| 1033 |
+
RQ_QUEUE_NAME=docgenie
|
| 1034 |
+
|
| 1035 |
+
# Batch Processing
|
| 1036 |
+
BATCH_POLL_INTERVAL=30 # seconds
|
| 1037 |
+
BATCH_DATA_DIR=/tmp/docgenie_batches
|
| 1038 |
+
MESSAGE_DATA_DIR=/tmp/docgenie_messages
|
| 1039 |
+
|
| 1040 |
+
# Google Drive
|
| 1041 |
+
GOOGLE_DRIVE_FOLDER_NAME=DocGenie Documents
|
| 1042 |
+
|
| 1043 |
+
# Supabase (already configured above)
|
| 1044 |
+
SUPABASE_URL=https://your-project.supabase.co
|
| 1045 |
+
SUPABASE_KEY=your_key_here
|
| 1046 |
+
|
| 1047 |
+
# Claude API
|
| 1048 |
+
ANTHROPIC_API_KEY=your_api_key_here
|
| 1049 |
+
```
|
| 1050 |
+
|
| 1051 |
+
2. Start the worker:
|
| 1052 |
+
|
| 1053 |
+
```bash
|
| 1054 |
+
cd api/
|
| 1055 |
+
./start_worker.sh
|
| 1056 |
+
```
|
| 1057 |
+
|
| 1058 |
+
The worker will:
|
| 1059 |
+
- ✓ Check Redis connection
|
| 1060 |
+
- ✓ Validate Supabase configuration
|
| 1061 |
+
- ✓ Verify Claude API key
|
| 1062 |
+
- ✓ Create temporary directories
|
| 1063 |
+
- ✓ Start RQ worker listening on `docgenie` queue
|
| 1064 |
+
|
| 1065 |
+
**Output:**
|
| 1066 |
+
```
|
| 1067 |
+
🚀 Starting DocGenie RQ Worker...
|
| 1068 |
+
✓ Loading .env file...
|
| 1069 |
+
✓ Redis connected
|
| 1070 |
+
✓ Supabase configured
|
| 1071 |
+
✓ Claude API key configured
|
| 1072 |
+
✓ Temporary directories created
|
| 1073 |
+
|
| 1074 |
+
============================================
|
| 1075 |
+
Worker Configuration:
|
| 1076 |
+
Queue: docgenie
|
| 1077 |
+
Redis: redis://localhost:6379/0
|
| 1078 |
+
Batch Data: /tmp/docgenie_batches
|
| 1079 |
+
Message Data: /tmp/docgenie_messages
|
| 1080 |
+
============================================
|
| 1081 |
+
|
| 1082 |
+
✅ Starting RQ worker (press Ctrl+C to stop)...
|
| 1083 |
+
|
| 1084 |
+
12:00:00 RQ worker 'worker-abc123' started on docgenie queue
|
| 1085 |
+
```
|
| 1086 |
+
|
| 1087 |
+
### Running Multiple Workers (Production)
|
| 1088 |
+
|
| 1089 |
+
For production systems with high load, run multiple workers:
|
| 1090 |
+
|
| 1091 |
+
```bash
|
| 1092 |
+
# Terminal 1
|
| 1093 |
+
./start_worker.sh
|
| 1094 |
+
|
| 1095 |
+
# Terminal 2
|
| 1096 |
+
./start_worker.sh
|
| 1097 |
+
|
| 1098 |
+
# Terminal 3
|
| 1099 |
+
./start_worker.sh
|
| 1100 |
+
```
|
| 1101 |
+
|
| 1102 |
+
Each worker processes jobs independently from the same queue.
|
| 1103 |
+
|
| 1104 |
+
**For detailed scaling instructions**, see [SCALING.md](SCALING.md).
|
| 1105 |
+
|
| 1106 |
+
### Monitoring Workers
|
| 1107 |
+
|
| 1108 |
+
```bash
|
| 1109 |
+
# View worker status
|
| 1110 |
+
rq info --url redis://localhost:6379/0
|
| 1111 |
+
|
| 1112 |
+
# View queue status
|
| 1113 |
+
rq info docgenie --url redis://localhost:6379/0
|
| 1114 |
+
|
| 1115 |
+
# View failed jobs
|
| 1116 |
+
rq info docgenie --url redis://localhost:6379/0  # failed count shown per queue (FailedJobRegistry)
|
| 1117 |
+
```
|
| 1118 |
+
|
| 1119 |
+
### Architecture Overview
|
| 1120 |
+
|
| 1121 |
+
```
|
| 1122 |
+
┌─────────────┐ ┌─────────────┐ ┌─────────────────┐
|
| 1123 |
+
│ FastAPI │───────▶│ Redis │◀───────│ RQ Workers │
|
| 1124 |
+
│ Server │ │ Queue │ │ (1-5 instances)│
|
| 1125 |
+
│ │ │ │ │ │
|
| 1126 |
+
│ /generate/ │ │ Job Queue: │ │ • Downloads │
|
| 1127 |
+
│ async │ │ - queued │ │ • Claude Batch │
|
| 1128 |
+
│ │ │ - pending │ │ • PDF render │
|
| 1129 |
+
│ /jobs/ │ │ - active │ │ • Handwriting │
|
| 1130 |
+
│ {id}/ │ │ │ │ • OCR │
|
| 1131 |
+
│ status │ │ │ │ • ZIP creation │
|
| 1132 |
+
└──────┬──────┘ └─────────────┘ └────────┬────────┘
|
| 1133 |
+
│ │
|
| 1134 |
+
│ │
|
| 1135 |
+
▼ ▼
|
| 1136 |
+
┌──────────────────────────────────────────────────────────────┐
|
| 1137 |
+
│ Supabase │
|
| 1138 |
+
│ • document_requests (job tracking) │
|
| 1139 |
+
│ • generated_documents (results metadata) │
|
| 1140 |
+
│ • user_integrations (Google Drive OAuth) │
|
| 1141 |
+
│ • analytics_events (usage tracking) │
|
| 1142 |
+
└───────────────────────────────────────────────────────────────┘
|
| 1143 |
+
│
|
| 1144 |
+
│ Upload Results
|
| 1145 |
+
▼
|
| 1146 |
+
┌──────────────────────────────────────────────────────────────┐
|
| 1147 |
+
│ Google Drive │
|
| 1148 |
+
│ • User's "DocGenie Documents" folder │
|
| 1149 |
+
│ • ZIP files with generated documents │
|
| 1150 |
+
│ • Shareable links returned to API │
|
| 1151 |
+
└──────────────────────────────────────────────────────────────┘
|
| 1152 |
+
```
|
| 1153 |
+
|
| 1154 |
+
### Cost Comparison: Direct vs Batched API
|
| 1155 |
+
|
| 1156 |
+
| API Type | Cost (Input) | Cost (Output) | Latency | Use Case |
|
| 1157 |
+
|----------|-------------|---------------|---------|----------|
|
| 1158 |
+
| Direct | $5.00/1M tokens | $15.00/1M tokens | 30-120s | Real-time, interactive |
|
| 1159 |
+
| **Batched** | **$2.50/1M tokens** | **$7.50/1M tokens** | 5-30 min | **Background jobs (recommended)** |
|
| 1160 |
+
|
| 1161 |
+
**Example Cost Calculation:**
|
| 1162 |
+
- Generate 100 documents per day
|
| 1163 |
+
- Each request: 5,000 input tokens, 10,000 output tokens
|
| 1164 |
+
|
| 1165 |
+
**Direct API Cost:**
|
| 1166 |
+
- Input: (100 × 5,000 / 1M) × $5.00 = $2.50/day
|
| 1167 |
+
- Output: (100 × 10,000 / 1M) × $15.00 = $15.00/day
|
| 1168 |
+
- **Total: $17.50/day = $525/month**
|
| 1169 |
+
|
| 1170 |
+
**Batched API Cost:**
|
| 1171 |
+
- Input: (100 × 5,000 / 1M) × $2.50 = $1.25/day
|
| 1172 |
+
- Output: (100 × 10,000 / 1M) × $7.50 = $7.50/day
|
| 1173 |
+
- **Total: $8.75/day = $262.50/month**
|
| 1174 |
+
|
| 1175 |
+
**💰 Savings: $262.50/month (50% reduction)**
|
| 1176 |
+
|
| 1177 |
+
## Scaling Workers
|
| 1178 |
+
|
| 1179 |
+
The API uses Redis Queue (RQ) workers for background job processing. Scale workers based on load:
|
| 1180 |
+
|
| 1181 |
+
| User Load | Workers | Redis RAM | Notes |
|
| 1182 |
+
|-----------|---------|-----------|-------|
|
| 1183 |
+
| < 10 req/hr | 1 | 256 MB | Development |
|
| 1184 |
+
| 10–50 req/hr | 2–3 | 512 MB | Small production |
|
| 1185 |
+
| 50–200 req/hr | 3–5 | 1 GB | Medium production |
|
| 1186 |
+
| > 200 req/hr | 5+ | 2+ GB | Large production |
|
| 1187 |
+
|
| 1188 |
+
### Starting Workers
|
| 1189 |
+
|
| 1190 |
+
```bash
|
| 1191 |
+
# Single worker (development)
|
| 1192 |
+
./start_worker.sh
|
| 1193 |
+
|
| 1194 |
+
# Multiple workers (production) — run in separate terminals
|
| 1195 |
+
./start_worker.sh # Terminal 1
|
| 1196 |
+
./start_worker.sh # Terminal 2
|
| 1197 |
+
|
| 1198 |
+
# Docker Compose — scale to 3 workers
|
| 1199 |
+
docker-compose up --scale worker=3
|
| 1200 |
+
|
| 1201 |
+
# Monitor
|
| 1202 |
+
rq info --url redis://localhost:6379/0
|
| 1203 |
+
rq info docgenie --url redis://localhost:6379/0
|
| 1204 |
+
```
|
| 1205 |
+
|
| 1206 |
+
### Railway Multi-Worker (Separate Service)
|
| 1207 |
+
1. Railway dashboard → New Service → GitHub Repo (same repo)
|
| 1208 |
+
2. Name: `docgenie-worker`
|
| 1209 |
+
3. Custom Start Command: `rq worker docgenie --url $REDIS_URL` (the queue name must match `RQ_QUEUE_NAME`)
|
| 1210 |
+
4. Add the same environment variables as the API service
|
| 1211 |
+
|
| 1212 |
+
> For most use cases the **combined** mode (API + worker in one service, see `railway.json`) is sufficient and cheaper.
|
| 1213 |
+
|
| 1214 |
+
## Contributing
|
| 1215 |
+
|
| 1216 |
+
This API is a simplified interface to the DocGenie pipeline. For the full pipeline with all features, see the main DocGenie documentation.
|
| 1217 |
+
|
| 1218 |
+
## License
|
| 1219 |
+
|
| 1220 |
+
Same as DocGenie main project.
|
api/TESTING.md
ADDED
|
@@ -0,0 +1,936 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Testing Guide: DocGenie API
|
| 2 |
+
|
| 3 |
+
Complete guide for testing the document generation API endpoints with Google Drive integration.
|
| 4 |
+
|
| 5 |
+
## Table of Contents
|
| 6 |
+
|
| 7 |
+
1. [Prerequisites](#prerequisites)
|
| 8 |
+
2. [Quick Start](#quick-start)
|
| 9 |
+
3. [Getting Google Drive Token](#getting-google-drive-token)
|
| 10 |
+
4. [Testing Async API](#testing-async-api)
|
| 11 |
+
5. [Testing Sync PDF API](#testing-sync-pdf-api)
|
| 12 |
+
6. [Manual Testing with cURL](#manual-testing-with-curl)
|
| 13 |
+
7. [Frontend Integration Example](#frontend-integration-example)
|
| 14 |
+
8. [Troubleshooting](#troubleshooting)
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## Prerequisites
|
| 19 |
+
|
| 20 |
+
### 1. Start Required Services
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
# Terminal 1: Start Redis
|
| 24 |
+
|
| 25 |
+
## Option A: Local Redis (Recommended for Development)
|
| 26 |
+
# Install Redis (Ubuntu/Debian)
|
| 27 |
+
sudo apt-get update && sudo apt-get install redis-server -y
|
| 28 |
+
sudo systemctl start redis-server
|
| 29 |
+
sudo systemctl enable redis-server
|
| 30 |
+
|
| 31 |
+
# Verify Redis is running
|
| 32 |
+
redis-cli ping # Should return "PONG"
|
| 33 |
+
|
| 34 |
+
## Option B: Docker (if Docker is installed)
|
| 35 |
+
# docker run -d -p 6379:6379 --name redis redis:7-alpine
|
| 36 |
+
|
| 37 |
+
# Terminal 2: Start FastAPI Server
|
| 38 |
+
cd docgenie/api
|
| 39 |
+
python main.py
|
| 40 |
+
|
| 41 |
+
# Terminal 3: Start RQ Worker
|
| 42 |
+
cd docgenie/api
|
| 43 |
+
./start_worker.sh
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 2. Configure Environment
|
| 47 |
+
|
| 48 |
+
Make sure your `api/.env` file has:
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
# Required
|
| 52 |
+
ANTHROPIC_API_KEY=your_claude_api_key
|
| 53 |
+
SUPABASE_URL=https://your-project.supabase.co
|
| 54 |
+
SUPABASE_KEY=your_supabase_key
|
| 55 |
+
REDIS_URL=redis://localhost:6379/0
|
| 56 |
+
|
| 57 |
+
# Optional (for token refresh)
|
| 58 |
+
GOOGLE_CLIENT_ID=your_client_id.apps.googleusercontent.com
|
| 59 |
+
GOOGLE_CLIENT_SECRET=your_client_secret
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
### 3. Create Supabase Tables
|
| 63 |
+
|
| 64 |
+
Run the SQL from [DEPLOYMENT.md](DEPLOYMENT.md#32-create-database-schema) in your Supabase SQL Editor.
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## Quick Start
|
| 69 |
+
|
| 70 |
+
### Option 1: Using Test Script (Easiest)
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
# Get Google Drive token first (one-time setup)
|
| 74 |
+
python api/test_get_google_token.py \
|
| 75 |
+
--client-id YOUR_CLIENT_ID \
|
| 76 |
+
--client-secret YOUR_CLIENT_SECRET
|
| 77 |
+
|
| 78 |
+
# Copy the access token, then run test
|
| 79 |
+
python api/test_async_api.py --google-token YOUR_ACCESS_TOKEN
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
### Option 2: Using OAuth Playground (Quick Test)
|
| 83 |
+
|
| 84 |
+
1. Go to [OAuth Playground](https://developers.google.com/oauthplayground/)
|
| 85 |
+
2. Configure with your credentials
|
| 86 |
+
3. Get access token
|
| 87 |
+
4. Run test script with token
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## Getting Google Drive Token
|
| 92 |
+
|
| 93 |
+
### Method 1: Using Helper Script (Recommended)
|
| 94 |
+
|
| 95 |
+
Our helper script automates the OAuth flow:
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
cd docgenie/api
|
| 99 |
+
|
| 100 |
+
python test_get_google_token.py \
|
| 101 |
+
--client-id YOUR_GOOGLE_CLIENT_ID \
|
| 102 |
+
--client-secret YOUR_GOOGLE_CLIENT_SECRET
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
**What it does:**
|
| 106 |
+
1. Opens browser for Google authorization
|
| 107 |
+
2. Starts local server on port 8080 for callback
|
| 108 |
+
3. Exchanges authorization code for tokens
|
| 109 |
+
4. Displays access token and refresh token
|
| 110 |
+
|
| 111 |
+
**Output:**
|
| 112 |
+
```
|
| 113 |
+
Access Token: ya29.a0AfH6SMBx...
|
| 114 |
+
Refresh Token: 1//0gw...
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
### Method 2: OAuth Playground (No Code)
|
| 118 |
+
|
| 119 |
+
1. **Go to**: https://developers.google.com/oauthplayground/
|
| 120 |
+
|
| 121 |
+
2. **Configure Credentials**:
|
| 122 |
+
- Click gear icon (⚙) in top right
|
| 123 |
+
- Check "Use your own OAuth credentials"
|
| 124 |
+
- Enter your Client ID and Client Secret
|
| 125 |
+
|
| 126 |
+
3. **Authorize API**:
|
| 127 |
+
- In left panel, scroll to "Drive API v3"
|
| 128 |
+
- Select: `https://www.googleapis.com/auth/drive.file`
|
| 129 |
+
- Click "Authorize APIs"
|
| 130 |
+
- Sign in with your Google account
|
| 131 |
+
|
| 132 |
+
4. **Get Token**:
|
| 133 |
+
- Click "Exchange authorization code for tokens"
|
| 134 |
+
- Copy the "Access token" value
|
| 135 |
+
|
| 136 |
+
### Method 3: Manual cURL (For Advanced Users)
|
| 137 |
+
|
| 138 |
+
**Step 1: Get Authorization Code**
|
| 139 |
+
|
| 140 |
+
Open this URL in browser (replace YOUR_CLIENT_ID):
|
| 141 |
+
|
| 142 |
+
```
|
| 143 |
+
https://accounts.google.com/o/oauth2/v2/auth?client_id=YOUR_CLIENT_ID&redirect_uri=http://localhost:8080&response_type=code&scope=https://www.googleapis.com/auth/drive.file&access_type=offline&prompt=consent
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
**Step 2: Exchange Code for Token**
|
| 147 |
+
|
| 148 |
+
After authorization, you'll be redirected to:
|
| 149 |
+
```
|
| 150 |
+
http://localhost:8080/?code=AUTHORIZATION_CODE
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
Exchange the code:
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
curl -X POST https://oauth2.googleapis.com/token \
|
| 157 |
+
-d "code=AUTHORIZATION_CODE" \
|
| 158 |
+
-d "client_id=YOUR_CLIENT_ID" \
|
| 159 |
+
-d "client_secret=YOUR_CLIENT_SECRET" \
|
| 160 |
+
-d "redirect_uri=http://localhost:8080" \
|
| 161 |
+
-d "grant_type=authorization_code"
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
Response:
|
| 165 |
+
```json
|
| 166 |
+
{
|
| 167 |
+
"access_token": "ya29.a0AfH6SMBx...",
|
| 168 |
+
"refresh_token": "1//0gw...",
|
| 169 |
+
"expires_in": 3600,
|
| 170 |
+
"token_type": "Bearer"
|
| 171 |
+
}
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## Testing Async API
|
| 177 |
+
|
| 178 |
+
The async API (`/generate/async`) is optimized for batch processing with 50% cost savings. Jobs are queued and processed in the background, with status polling.
|
| 179 |
+
|
| 180 |
+
### Full Automated Test
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
cd docgenie/api
|
| 184 |
+
|
| 185 |
+
# Set token as environment variable
|
| 186 |
+
export GOOGLE_DRIVE_TOKEN="ya29.a0AfH6SMBx..."
|
| 187 |
+
|
| 188 |
+
# Run test (generates 2 documents by default)
|
| 189 |
+
python test_async_api.py
|
| 190 |
+
|
| 191 |
+
# Or pass token directly
|
| 192 |
+
python test_async_api.py --google-token "ya29.a0AfH6SMBx..."
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
**Test Flow:**
|
| 196 |
+
1. ✓ Health check
|
| 197 |
+
2. ✓ Submit async job
|
| 198 |
+
3. ✓ Poll status (every 30 seconds)
|
| 199 |
+
4. ✓ List user jobs
|
| 200 |
+
5. ✓ Display Google Drive link
|
| 201 |
+
|
| 202 |
+
**Expected Output:**
|
| 203 |
+
```
|
| 204 |
+
================================================================================
|
| 205 |
+
ASYNC API TEST SUITE
|
| 206 |
+
================================================================================
|
| 207 |
+
Base URL: http://localhost:8000
|
| 208 |
+
User ID: 1
|
| 209 |
+
Documents to Generate: 2
|
| 210 |
+
================================================================================
|
| 211 |
+
|
| 212 |
+
============================================================
|
| 213 |
+
1. Testing API Health
|
| 214 |
+
============================================================
|
| 215 |
+
✓ API is healthy: {'status': 'healthy', 'version': '1.0.0'}
|
| 216 |
+
|
| 217 |
+
============================================================
|
| 218 |
+
2. Submitting Async Job
|
| 219 |
+
============================================================
|
| 220 |
+
Payload:
|
| 221 |
+
User ID: 1
|
| 222 |
+
Seed Images: 1
|
| 223 |
+
Num Solutions: 2
|
| 224 |
+
Google Token: ya29.a0AfH6SMBx...
|
| 225 |
+
|
| 226 |
+
✓ Job submitted successfully!
|
| 227 |
+
Request ID: 550e8400-e29b-41d4-a716-446655440000
|
| 228 |
+
Status: queued
|
| 229 |
+
Estimated Time: 10 minutes
|
| 230 |
+
Poll URL: /jobs/550e8400-e29b-41d4-a716-446655440000/status
|
| 231 |
+
|
| 232 |
+
============================================================
|
| 233 |
+
3. Polling Job Status
|
| 234 |
+
============================================================
|
| 235 |
+
Polling every 30 seconds (max 60 attempts)
|
| 236 |
+
Status flow: queued → processing → generating → completed/failed
|
| 237 |
+
|
| 238 |
+
[12:00:00] Poll 1/60: QUEUED
|
| 239 |
+
[12:00:30] Poll 2/60: PROCESSING - Creating batch request...
|
| 240 |
+
[12:01:00] Poll 3/60: GENERATING - Batch submitted to Claude...
|
| 241 |
+
[12:08:30] Poll 17/60: GENERATING - Polling batch status...
|
| 242 |
+
[12:15:00] Poll 30/60: COMPLETED
|
| 243 |
+
|
| 244 |
+
============================================================
|
| 245 |
+
✓ JOB COMPLETED!
|
| 246 |
+
============================================================
|
| 247 |
+
Download URL: https://drive.google.com/file/d/abc123xyz/view?usp=sharing
|
| 248 |
+
File Size: 15.4 MB
|
| 249 |
+
Document Count: 2
|
| 250 |
+
Created: 2026-02-28T12:00:00Z
|
| 251 |
+
Completed: 2026-02-28T12:15:00Z
|
| 252 |
+
|
| 253 |
+
================================================================================
|
| 254 |
+
TEST SUMMARY
|
| 255 |
+
================================================================================
|
| 256 |
+
✓ ALL TESTS PASSED!
|
| 257 |
+
|
| 258 |
+
Your documents are available at:
|
| 259 |
+
https://drive.google.com/file/d/abc123xyz/view?usp=sharing
|
| 260 |
+
|
| 261 |
+
Next steps:
|
| 262 |
+
1. Open the Google Drive link in your browser
|
| 263 |
+
2. Download the ZIP file
|
| 264 |
+
3. Extract and verify generated documents
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
### Test Options
|
| 268 |
+
|
| 269 |
+
```bash
|
| 270 |
+
# Custom number of documents
|
| 271 |
+
python test_async_api.py --google-token TOKEN --num-solutions 5
|
| 272 |
+
|
| 273 |
+
# Custom API URL (if deployed)
|
| 274 |
+
python test_async_api.py --google-token TOKEN --base-url https://api.yourdomain.com
|
| 275 |
+
|
| 276 |
+
# Different user ID
|
| 277 |
+
python test_async_api.py --google-token TOKEN --user-id 42
|
| 278 |
+
|
| 279 |
+
# With refresh token
|
| 280 |
+
python test_async_api.py \
|
| 281 |
+
--google-token ACCESS_TOKEN \
|
| 282 |
+
--google-refresh-token REFRESH_TOKEN
|
| 283 |
+
|
| 284 |
+
# Show help for getting token
|
| 285 |
+
python test_async_api.py --help-token
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
---

## Testing Sync PDF API
|
| 289 |
+
|
| 290 |
+
The sync PDF API (`/generate/pdf`) returns results immediately (20-60s) and supports three modes of operation. Perfect for smaller batch sizes and real-time workflows.
|
| 291 |
+
|
| 292 |
+
### Three Operating Modes
|
| 293 |
+
|
| 294 |
+
**Mode 1: Quick Demo (No Tracking)**
|
| 295 |
+
- Returns ZIP immediately
|
| 296 |
+
- No Supabase records created
|
| 297 |
+
- Perfect for quick testing and demos
|
| 298 |
+
- No user_id required
|
| 299 |
+
|
| 300 |
+
**Mode 2: Demo with Tracking**
|
| 301 |
+
- Returns ZIP immediately
|
| 302 |
+
- Creates Supabase record for tracking
|
| 303 |
+
- Can poll status during generation
|
| 304 |
+
- Requires user_id
|
| 305 |
+
|
| 306 |
+
**Mode 3: Full Production**
|
| 307 |
+
- Returns ZIP immediately
|
| 308 |
+
- Creates Supabase record
|
| 309 |
+
- Uploads to Google Drive in background
|
| 310 |
+
- Requires user_id + google_drive_token
|
| 311 |
+
- Best for production use
|
| 312 |
+
|
| 313 |
+
### Full Automated Test
|
| 314 |
+
|
| 315 |
+
```bash
|
| 316 |
+
cd docgenie/api
|
| 317 |
+
|
| 318 |
+
# Mode 1: Quick demo (no tracking)
|
| 319 |
+
python test_sync_pdf_api.py
|
| 320 |
+
|
| 321 |
+
# Mode 2: Demo with tracking
|
| 322 |
+
python test_sync_pdf_api.py --user-id 123
|
| 323 |
+
|
| 324 |
+
# Mode 3: Full production (tracking + GDrive)
|
| 325 |
+
python test_sync_pdf_api.py \
|
| 326 |
+
--user-id 123 \
|
| 327 |
+
--google-token "ya29.a0AfH6SMBx..." \
|
| 328 |
+
--google-refresh-token "1//0gw..."
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
**Test Flow for All Modes:**
|
| 332 |
+
1. ✓ Health check
|
| 333 |
+
2. ✓ Test Mode 1: Quick demo (always runs)
|
| 334 |
+
3. ✓ Test Mode 2: With tracking (if user_id provided)
|
| 335 |
+
4. ✓ Test Mode 3: Full production (if user_id + token provided)
|
| 336 |
+
5. ✓ Validate ZIP contents
|
| 337 |
+
6. ✓ Test status polling (Modes 2 & 3)
|
| 338 |
+
7. ✓ Verify GDrive upload (Mode 3)
|
| 339 |
+
|
| 340 |
+
**Expected Output:**
|
| 341 |
+
```
|
| 342 |
+
================================================================================
|
| 343 |
+
DocGenie /generate/pdf Endpoint Test Suite
|
| 344 |
+
================================================================================
|
| 345 |
+
|
| 346 |
+
================================================================================
|
| 347 |
+
1. Testing API Health
|
| 348 |
+
================================================================================
|
| 349 |
+
✓ API is healthy: {'status': 'healthy', 'version': '1.0.0'}
|
| 350 |
+
|
| 351 |
+
================================================================================
|
| 352 |
+
2. Testing Mode 1: Quick Demo (No Tracking)
|
| 353 |
+
================================================================================
|
| 354 |
+
This mode returns ZIP immediately without creating Supabase records.
|
| 355 |
+
Use for quick testing and demos.
|
| 356 |
+
|
| 357 |
+
Payload:
|
| 358 |
+
Seed Images: 1
|
| 359 |
+
Num Solutions: 1
|
| 360 |
+
User ID: None (no tracking)
|
| 361 |
+
Google Token: None
|
| 362 |
+
|
| 363 |
+
⏳ Calling /generate/pdf (expect 20-60 seconds)...
|
| 364 |
+
|
| 365 |
+
✓ Response received in 42.3 seconds
|
| 366 |
+
|
| 367 |
+
Response Headers:
|
| 368 |
+
Content-Type: application/zip
|
| 369 |
+
Content-Disposition: attachment; filename=docgenie_documents.zip
|
| 370 |
+
X-Request-ID: NOT SET (expected in mode 1)
|
| 371 |
+
X-Status-URL: NOT SET (expected in mode 1)
|
| 372 |
+
|
| 373 |
+
✓ ZIP file size: 145.2 KB
|
| 374 |
+
✓ ZIP contains 18 files:
|
| 375 |
+
- README.md
|
| 376 |
+
- metadata.json
|
| 377 |
+
- analysis/document_1.json
|
| 378 |
+
- annotations/gt/document_1.json
|
| 379 |
+
- bbox/bbox_pdf/word/document_1.json
|
| 380 |
+
- html/document_1.css
|
| 381 |
+
- html/document_1.html
|
| 382 |
+
- img/document_1.png
|
| 383 |
+
- pdf/pdf_final/document_1.pdf
|
| 384 |
+
- pdf/pdf_initial/document_1.pdf
|
| 385 |
+
✓ Contains metadata.json
|
| 386 |
+
✓ Contains README.md
|
| 387 |
+
|
| 388 |
+
✅ Mode 1 (Quick Demo) Test PASSED
|
| 389 |
+
⚡ Fast response: 42.3s
|
| 390 |
+
📦 Valid ZIP file
|
| 391 |
+
✓ No tracking overhead
|
| 392 |
+
|
| 393 |
+
================================================================================
|
| 394 |
+
3. Testing Mode 2: Demo with Progress Tracking
|
| 395 |
+
================================================================================
|
| 396 |
+
This mode returns ZIP immediately AND creates Supabase record.
|
| 397 |
+
Client can poll /jobs/{request_id}/status during generation.
|
| 398 |
+
|
| 399 |
+
Payload:
|
| 400 |
+
User ID: 123 (tracking enabled)
|
| 401 |
+
Seed Images: 1
|
| 402 |
+
Num Solutions: 2
|
| 403 |
+
Google Token: None
|
| 404 |
+
|
| 405 |
+
⏳ Calling /generate/pdf (expect 20-60 seconds)...
|
| 406 |
+
|
| 407 |
+
✓ Response received in 58.7 seconds
|
| 408 |
+
|
| 409 |
+
Response Headers:
|
| 410 |
+
Content-Type: application/zip
|
| 411 |
+
✓ X-Request-ID: 550e8400-e29b-41d4-a716-446655440000
|
| 412 |
+
✓ X-Status-URL: /jobs/550e8400-e29b-41d4-a716-446655440000/status
|
| 413 |
+
|
| 414 |
+
✓ ZIP file size: 287.4 KB
|
| 415 |
+
✓ ZIP contains 32 files
|
| 416 |
+
✓ Found 4 PDF files
|
| 417 |
+
|
| 418 |
+
⏳ Testing status polling endpoint...
|
| 419 |
+
✓ Status endpoint working:
|
| 420 |
+
Request ID: 550e8400-e29b-41d4-a716-446655440000
|
| 421 |
+
Status: completed
|
| 422 |
+
Created: 2026-03-01T10:15:00Z
|
| 423 |
+
Updated: 2026-03-01T10:15:58Z
|
| 424 |
+
✓ Job marked as completed
|
| 425 |
+
|
| 426 |
+
✅ Mode 2 (Tracking) Test PASSED
|
| 427 |
+
⚡ Fast response: 58.7s
|
| 428 |
+
📦 Valid ZIP file
|
| 429 |
+
📊 Progress tracking enabled
|
| 430 |
+
✓ Can poll status during generation
|
| 431 |
+
|
| 432 |
+
================================================================================
|
| 433 |
+
4. Testing Mode 3: Full Production (Tracking + GDrive Upload)
|
| 434 |
+
================================================================================
|
| 435 |
+
This mode returns ZIP immediately AND uploads to Google Drive in background.
|
| 436 |
+
Best for production use with full tracking and backup.
|
| 437 |
+
|
| 438 |
+
Payload:
|
| 439 |
+
User ID: 123
|
| 440 |
+
Google Token: ya29.a0AfH6SMBx...
|
| 441 |
+
Google Refresh: Yes
|
| 442 |
+
Seed Images: 1
|
| 443 |
+
Num Solutions: 1
|
| 444 |
+
|
| 445 |
+
⏳ Calling /generate/pdf (expect 20-60 seconds)...
|
| 446 |
+
|
| 447 |
+
✓ Response received in 45.1 seconds
|
| 448 |
+
|
| 449 |
+
Response Headers:
|
| 450 |
+
✓ X-Request-ID: 660f9511-f3ac-52e5-b827-557766551111
|
| 451 |
+
✓ X-Status-URL: /jobs/660f9511-f3ac-52e5-b827-557766551111/status
|
| 452 |
+
|
| 453 |
+
✓ ZIP file size: 151.8 KB
|
| 454 |
+
✓ ZIP contains 18 files
|
| 455 |
+
|
| 456 |
+
⏳ ZIP returned immediately, GDrive upload happening in background...
|
| 457 |
+
(This doesn't block the response)
|
| 458 |
+
|
| 459 |
+
⏳ Waiting 10 seconds for background GDrive upload...
|
| 460 |
+
✓ Status after background upload:
|
| 461 |
+
Status: completed
|
| 462 |
+
✓ GDrive URL: https://drive.google.com/file/d/abc123xyz/view?usp=...
|
| 463 |
+
✓ Background upload completed!
|
| 464 |
+
|
| 465 |
+
✅ Mode 3 (Full Production) Test PASSED
|
| 466 |
+
⚡ Fast response: 45.1s (GDrive doesn't block)
|
| 467 |
+
📦 Valid ZIP file delivered immediately
|
| 468 |
+
📊 Progress tracking enabled
|
| 469 |
+
☁️ Google Drive backup scheduled
|
| 470 |
+
✓ Production-ready configuration
|
| 471 |
+
|
| 472 |
+
================================================================================
|
| 473 |
+
TEST SUMMARY
|
| 474 |
+
================================================================================
|
| 475 |
+
✅ health: PASSED
|
| 476 |
+
✅ mode_1: PASSED
|
| 477 |
+
✅ mode_2: PASSED
|
| 478 |
+
✅ mode_3: PASSED
|
| 479 |
+
|
| 480 |
+
4/4 tests passed
|
| 481 |
+
|
| 482 |
+
🎉 All tests passed!
|
| 483 |
+
================================================================================
|
| 484 |
+
```
|
| 485 |
+
|
| 486 |
+
### Test Options
|
| 487 |
+
|
| 488 |
+
```bash
|
| 489 |
+
# Mode 1 only (default)
|
| 490 |
+
python test_sync_pdf_api.py
|
| 491 |
+
|
| 492 |
+
# Mode 2 with custom user ID
|
| 493 |
+
python test_sync_pdf_api.py --user-id 456
|
| 494 |
+
|
| 495 |
+
# Mode 3 with custom API URL
|
| 496 |
+
python test_sync_pdf_api.py \
|
| 497 |
+
--base-url https://api.yourdomain.com \
|
| 498 |
+
--user-id 123 \
|
| 499 |
+
--google-token TOKEN \
|
| 500 |
+
--google-refresh-token REFRESH_TOKEN
|
| 501 |
+
```
|
| 502 |
+
|
| 503 |
+
### Comparing Sync vs Async
|
| 504 |
+
|
| 505 |
+
| Feature | Sync (`/generate/pdf`) | Async (`/generate/async`) |
|
| 506 |
+
|---------|------------------------|---------------------------|
|
| 507 |
+
| **Response Time** | 20-60 seconds | 5-30 minutes |
|
| 508 |
+
| **Best For** | 1-3 documents | 5-50+ documents |
|
| 509 |
+
| **Cost** | Standard API pricing | 50% cheaper (Batch API) |
|
| 510 |
+
| **Result Delivery** | Direct ZIP download | Google Drive upload |
|
| 511 |
+
| **Progress Tracking** | Optional (Modes 2 & 3) | Always enabled |
|
| 512 |
+
| **Use Case** | Real-time workflows, demos | Bulk generation, scheduled jobs |
|
| 513 |
+
|
| 514 |
+
**When to use Sync:**
|
| 515 |
+
- Generating 1-3 documents
|
| 516 |
+
- Need immediate results
|
| 517 |
+
- Real-time user interactions
|
| 518 |
+
- Quick testing and demos
|
| 519 |
+
|
| 520 |
+
**When to use Async:**
|
| 521 |
+
- Generating 5+ documents
|
| 522 |
+
- Cost optimization (50% savings)
|
| 523 |
+
- Background/scheduled processing
|
| 524 |
+
- Large batch jobs
|
| 525 |
+
|
| 526 |
+
---
|
| 527 |
+
|
| 528 |
+
## Manual Testing with cURL
|
| 529 |
+
|
| 530 |
+
### Async API (`/generate/async`)
|
| 531 |
+
|
| 532 |
+
#### 1. Submit Async Job
|
| 533 |
+
|
| 534 |
+
```bash
|
| 535 |
+
curl -X POST http://localhost:8000/generate/async \
|
| 536 |
+
-H "Content-Type: application/json" \
|
| 537 |
+
-d '{
|
| 538 |
+
"user_id": 1,
|
| 539 |
+
"google_drive_token": "ya29.a0AfH6SMBx...",
|
| 540 |
+
"seed_images": ["https://ocr.space/Content/Images/receipt-ocr-original.webp"],
|
| 541 |
+
"prompt_params": {
|
| 542 |
+
"language": "English",
|
| 543 |
+
"doc_type": "receipts",
|
| 544 |
+
"num_solutions": 2,
|
| 545 |
+
"enable_handwriting": false,
|
| 546 |
+
"enable_visual_elements": false,
|
| 547 |
+
"output_detail": "minimal"
|
| 548 |
+
}
|
| 549 |
+
}'
|
| 550 |
+
```
|
| 551 |
+
|
| 552 |
+
**Response:**
|
| 553 |
+
```json
|
| 554 |
+
{
|
| 555 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 556 |
+
"status": "queued",
|
| 557 |
+
"estimated_time_minutes": 10,
|
| 558 |
+
"poll_url": "/jobs/550e8400-e29b-41d4-a716-446655440000/status",
|
| 559 |
+
"created_at": "2026-02-28T12:00:00Z"
|
| 560 |
+
}
|
| 561 |
+
```
|
| 562 |
+
|
| 563 |
+
#### 2. Check Job Status
|
| 564 |
+
|
| 565 |
+
```bash
|
| 566 |
+
curl http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/status
|
| 567 |
+
```
|
| 568 |
+
|
| 569 |
+
**Response (Processing):**
|
| 570 |
+
```json
|
| 571 |
+
{
|
| 572 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 573 |
+
"status": "processing",
|
| 574 |
+
"created_at": "2026-02-28T12:00:00Z",
|
| 575 |
+
"updated_at": "2026-02-28T12:02:00Z",
|
| 576 |
+
"progress": "Creating batch request..."
|
| 577 |
+
}
|
| 578 |
+
```
|
| 579 |
+
|
| 580 |
+
**Response (Completed):**
|
| 581 |
+
```json
|
| 582 |
+
{
|
| 583 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 584 |
+
"status": "completed",
|
| 585 |
+
"created_at": "2026-02-28T12:00:00Z",
|
| 586 |
+
"updated_at": "2026-02-28T12:15:00Z",
|
| 587 |
+
"download_url": "https://drive.google.com/file/d/abc123xyz/view?usp=sharing",
|
| 588 |
+
"file_size_mb": 15.4,
|
| 589 |
+
"document_count": 2
|
| 590 |
+
}
|
| 591 |
+
```
|
| 592 |
+
|
| 593 |
+
#### 3. List User Jobs
|
| 594 |
+
|
| 595 |
+
```bash
|
| 596 |
+
curl "http://localhost:8000/jobs/user/1?limit=10&offset=0"
|
| 597 |
+
```
|
| 598 |
+
|
| 599 |
+
**Response:**
|
| 600 |
+
```json
|
| 601 |
+
{
|
| 602 |
+
"user_id": 1,
|
| 603 |
+
"jobs": [
|
| 604 |
+
{
|
| 605 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 606 |
+
"status": "completed",
|
| 607 |
+
"created_at": "2026-02-28T12:00:00Z",
|
| 608 |
+
"download_url": "https://drive.google.com/file/d/abc123xyz/view"
|
| 609 |
+
}
|
| 610 |
+
],
|
| 611 |
+
"count": 1,
|
| 612 |
+
"limit": 10,
|
| 613 |
+
"offset": 0
|
| 614 |
+
}
|
| 615 |
+
```
|
| 616 |
+
|
| 617 |
+
### Sync PDF API (`/generate/pdf`)
|
| 618 |
+
|
| 619 |
+
#### Mode 1: Quick Demo (No Tracking)
|
| 620 |
+
|
| 621 |
+
```bash
|
| 622 |
+
curl -X POST http://localhost:8000/generate/pdf \
|
| 623 |
+
-H "Content-Type: application/json" \
|
| 624 |
+
-d '{
|
| 625 |
+
"seed_images": ["https://ocr.space/Content/Images/receipt-ocr-original.webp"],
|
| 626 |
+
"prompt_params": {
|
| 627 |
+
"language": "English",
|
| 628 |
+
"doc_type": "receipts",
|
| 629 |
+
"num_solutions": 1,
|
| 630 |
+
"enable_handwriting": false,
|
| 631 |
+
"enable_visual_elements": false,
|
| 632 |
+
"output_detail": "minimal"
|
| 633 |
+
}
|
| 634 |
+
}' \
|
| 635 |
+
--output documents.zip
|
| 636 |
+
```
|
| 637 |
+
|
| 638 |
+
**Response:**
|
| 639 |
+
- Returns ZIP file directly (binary)
|
| 640 |
+
- No tracking headers
|
| 641 |
+
- File saved as `documents.zip`
|
| 642 |
+
|
| 643 |
+
#### Mode 2: Demo with Tracking
|
| 644 |
+
|
| 645 |
+
```bash
|
| 646 |
+
curl -X POST http://localhost:8000/generate/pdf \
|
| 647 |
+
-H "Content-Type: application/json" \
|
| 648 |
+
-d '{
|
| 649 |
+
"user_id": 123,
|
| 650 |
+
"seed_images": ["https://ocr.space/Content/Images/receipt-ocr-original.webp"],
|
| 651 |
+
"prompt_params": {
|
| 652 |
+
"language": "English",
|
| 653 |
+
"doc_type": "business documents",
|
| 654 |
+
"num_solutions": 2,
|
| 655 |
+
"enable_handwriting": false,
|
| 656 |
+
"output_detail": "minimal"
|
| 657 |
+
}
|
| 658 |
+
}' \
|
| 659 |
+
--output documents.zip \
|
| 660 |
+
-D headers.txt
|
| 661 |
+
```
|
| 662 |
+
|
| 663 |
+
**Response:**
|
| 664 |
+
- Returns ZIP file directly (binary)
|
| 665 |
+
- Headers saved to `headers.txt` contain:
|
| 666 |
+
- `X-Request-ID: 550e8400-e29b-41d4-a716-446655440000`
|
| 667 |
+
- `X-Status-URL: /jobs/550e8400-e29b-41d4-a716-446655440000/status`
|
| 668 |
+
|
| 669 |
+
**Check Status:**
|
| 670 |
+
```bash
|
| 671 |
+
# Extract request_id from headers.txt, then:
|
| 672 |
+
curl http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/status
|
| 673 |
+
```
|
| 674 |
+
|
| 675 |
+
#### Mode 3: Full Production (Tracking + GDrive)
|
| 676 |
+
|
| 677 |
+
```bash
|
| 678 |
+
curl -X POST http://localhost:8000/generate/pdf \
|
| 679 |
+
-H "Content-Type: application/json" \
|
| 680 |
+
-d '{
|
| 681 |
+
"user_id": 123,
|
| 682 |
+
"google_drive_token": "ya29.a0AfH6SMBx...",
|
| 683 |
+
"google_drive_refresh_token": "1//0gw...",
|
| 684 |
+
"seed_images": ["https://ocr.space/Content/Images/receipt-ocr-original.webp"],
|
| 685 |
+
"prompt_params": {
|
| 686 |
+
"language": "English",
|
| 687 |
+
"doc_type": "invoices",
|
| 688 |
+
"num_solutions": 1,
|
| 689 |
+
"enable_handwriting": false,
|
| 690 |
+
"output_detail": "dataset"
|
| 691 |
+
}
|
| 692 |
+
}' \
|
| 693 |
+
--output documents.zip \
|
| 694 |
+
-D headers.txt
|
| 695 |
+
```
|
| 696 |
+
|
| 697 |
+
**Response:**
|
| 698 |
+
- Returns ZIP file immediately (binary)
|
| 699 |
+
- Google Drive upload happens in background
|
| 700 |
+
- Wait 10-30 seconds, then check status for GDrive URL:
|
| 701 |
+
|
| 702 |
+
```bash
|
| 703 |
+
curl http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/status
|
| 704 |
+
```
|
| 705 |
+
|
| 706 |
+
**Response (after background upload):**
|
| 707 |
+
```json
|
| 708 |
+
{
|
| 709 |
+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
| 710 |
+
"status": "completed",
|
| 711 |
+
"created_at": "2026-03-01T10:00:00Z",
|
| 712 |
+
"updated_at": "2026-03-01T10:00:45Z",
|
| 713 |
+
"results": {
|
| 714 |
+
"download_url": "https://drive.google.com/file/d/abc123xyz/view?usp=sharing",
|
| 715 |
+
"file_size_mb": 0.15,
|
| 716 |
+
"document_count": 1,
|
| 717 |
+
"zip_filename": "docgenie_550e8400-e29b-41d4-a716-446655440000.zip"
|
| 718 |
+
}
|
| 719 |
+
}
|
| 720 |
+
```
|
| 721 |
+
|
| 722 |
+
---
|
| 723 |
+
|
| 724 |
+
## Frontend Integration Example
|
| 725 |
+
|
| 726 |
+
### React + TypeScript
|
| 727 |
+
|
| 728 |
+
```typescript
|
| 729 |
+
import { useState, useEffect } from 'react';
|
| 730 |
+
|
| 731 |
+
interface JobStatus {
|
| 732 |
+
request_id: string;
|
| 733 |
+
status: 'queued' | 'processing' | 'generating' | 'completed' | 'failed';
|
| 734 |
+
download_url?: string;
|
| 735 |
+
error_message?: string;
|
| 736 |
+
}
|
| 737 |
+
|
| 738 |
+
function DocumentGenerator() {
|
| 739 |
+
const [jobId, setJobId] = useState<string | null>(null);
|
| 740 |
+
const [status, setStatus] = useState<JobStatus | null>(null);
|
| 741 |
+
const [googleToken, setGoogleToken] = useState<string>('');
|
| 742 |
+
|
| 743 |
+
// Step 1: Google OAuth (implement separately)
|
| 744 |
+
const handleGoogleAuth = async () => {
|
| 745 |
+
// Redirect to Google OAuth
|
| 746 |
+
const clientId = 'YOUR_CLIENT_ID';
|
| 747 |
+
const redirectUri = 'https://yourapp.com/auth/callback';
|
| 748 |
+
const scope = 'https://www.googleapis.com/auth/drive.file';
|
| 749 |
+
|
| 750 |
+
const authUrl = `https://accounts.google.com/o/oauth2/v2/auth?` +
|
| 751 |
+
`client_id=${clientId}&` +
|
| 752 |
+
`redirect_uri=${redirectUri}&` +
|
| 753 |
+
`response_type=code&` +
|
| 754 |
+
`scope=${scope}&` +
|
| 755 |
+
`access_type=offline&` +
|
| 756 |
+
`prompt=consent`;
|
| 757 |
+
|
| 758 |
+
window.location.href = authUrl;
|
| 759 |
+
};
|
| 760 |
+
|
| 761 |
+
// Step 2: Submit job
|
| 762 |
+
const handleGenerateDocuments = async () => {
|
| 763 |
+
const response = await fetch('http://localhost:8000/generate/async', {
|
| 764 |
+
method: 'POST',
|
| 765 |
+
headers: { 'Content-Type': 'application/json' },
|
| 766 |
+
body: JSON.stringify({
|
| 767 |
+
user_id: 1,
|
| 768 |
+
google_drive_token: googleToken,
|
| 769 |
+
seed_images: ['https://example.com/seed.jpg'],
|
| 770 |
+
prompt_params: {
|
| 771 |
+
language: 'English',
|
| 772 |
+
doc_type: 'receipts',
|
| 773 |
+
num_solutions: 3
|
| 774 |
+
}
|
| 775 |
+
})
|
| 776 |
+
});
|
| 777 |
+
|
| 778 |
+
const job = await response.json();
|
| 779 |
+
setJobId(job.request_id);
|
| 780 |
+
};
|
| 781 |
+
|
| 782 |
+
// Step 3: Poll status
|
| 783 |
+
useEffect(() => {
|
| 784 |
+
if (!jobId) return;
|
| 785 |
+
|
| 786 |
+
const interval = setInterval(async () => {
|
| 787 |
+
const response = await fetch(
|
| 788 |
+
`http://localhost:8000/jobs/${jobId}/status`
|
| 789 |
+
);
|
| 790 |
+
const data = await response.json();
|
| 791 |
+
setStatus(data);
|
| 792 |
+
|
| 793 |
+
if (data.status === 'completed' || data.status === 'failed') {
|
| 794 |
+
clearInterval(interval);
|
| 795 |
+
}
|
| 796 |
+
}, 30000); // Poll every 30 seconds
|
| 797 |
+
|
| 798 |
+
return () => clearInterval(interval);
|
| 799 |
+
}, [jobId]);
|
| 800 |
+
|
| 801 |
+
return (
|
| 802 |
+
<div>
|
| 803 |
+
{!googleToken ? (
|
| 804 |
+
<button onClick={handleGoogleAuth}>
|
| 805 |
+
Connect Google Drive
|
| 806 |
+
</button>
|
| 807 |
+
) : (
|
| 808 |
+
<button onClick={handleGenerateDocuments}>
|
| 809 |
+
Generate Documents
|
| 810 |
+
</button>
|
| 811 |
+
)}
|
| 812 |
+
|
| 813 |
+
{status && (
|
| 814 |
+
<div>
|
| 815 |
+
<p>Status: {status.status}</p>
|
| 816 |
+
{status.status === 'completed' && (
|
| 817 |
+
<a href={status.download_url} target="_blank">
|
| 818 |
+
Download Documents
|
| 819 |
+
</a>
|
| 820 |
+
)}
|
| 821 |
+
{status.status === 'failed' && (
|
| 822 |
+
<p>Error: {status.error_message}</p>
|
| 823 |
+
)}
|
| 824 |
+
</div>
|
| 825 |
+
)}
|
| 826 |
+
</div>
|
| 827 |
+
);
|
| 828 |
+
}
|
| 829 |
+
```
|
| 830 |
+
|
| 831 |
+
---
|
| 832 |
+
|
| 833 |
+
## Troubleshooting
|
| 834 |
+
|
| 835 |
+
### Issue: "google_drive_token is required"
|
| 836 |
+
|
| 837 |
+
**Cause**: No token provided in request
|
| 838 |
+
|
| 839 |
+
**Solution**:
|
| 840 |
+
```bash
|
| 841 |
+
# Make sure you're passing the token
|
| 842 |
+
python test_async_api.py --google-token "ya29.a0AfH6SMBx..."
|
| 843 |
+
```
|
| 844 |
+
|
| 845 |
+
### Issue: "Failed to refresh Google Drive token"
|
| 846 |
+
|
| 847 |
+
**Cause**: Token expired and no refresh token provided
|
| 848 |
+
|
| 849 |
+
**Solutions**:
|
| 850 |
+
1. Get a new token (tokens expire in ~1 hour)
|
| 851 |
+
2. Include refresh token in request
|
| 852 |
+
3. Frontend should refresh tokens automatically
|
| 853 |
+
|
| 854 |
+
### Issue: "Google Drive upload failed: insufficient permissions"
|
| 855 |
+
|
| 856 |
+
**Cause**: Token doesn't have drive.file scope
|
| 857 |
+
|
| 858 |
+
**Solution**: Re-authorize with correct scope:
|
| 859 |
+
```
|
| 860 |
+
https://www.googleapis.com/auth/drive.file
|
| 861 |
+
```
|
| 862 |
+
|
| 863 |
+
### Issue: Worker not processing jobs
|
| 864 |
+
|
| 865 |
+
**Check 1**: Is Redis running?
|
| 866 |
+
```bash
|
| 867 |
+
redis-cli ping # Should return "PONG"
|
| 868 |
+
```
|
| 869 |
+
|
| 870 |
+
**Check 2**: Is worker running?
|
| 871 |
+
```bash
|
| 872 |
+
# Check worker logs
|
| 873 |
+
journalctl -u docgenie-worker@1 -f
|
| 874 |
+
|
| 875 |
+
# Or check RQ info
|
| 876 |
+
rq info --url redis://localhost:6379/0
|
| 877 |
+
```
|
| 878 |
+
|
| 879 |
+
**Check 3**: Check failed queue
|
| 880 |
+
```bash
|
| 881 |
+
rq info --queue failed --url redis://localhost:6379/0
|
| 882 |
+
```
|
| 883 |
+
|
| 884 |
+
### Issue: Job stuck in "generating" status
|
| 885 |
+
|
| 886 |
+
**Cause**: Batch API taking longer than expected
|
| 887 |
+
|
| 888 |
+
**Solution**: Wait up to 30 minutes for batched requests. Check Anthropic dashboard:
|
| 889 |
+
https://console.anthropic.com/settings/batches
|
| 890 |
+
|
| 891 |
+
### Issue: Cannot access Google Drive link
|
| 892 |
+
|
| 893 |
+
**Cause**: File not shared properly
|
| 894 |
+
|
| 895 |
+
**Solution**: Check worker logs for sharing errors. File should have "anyone with link" permission.
|
| 896 |
+
|
| 897 |
+
---
|
| 898 |
+
|
| 899 |
+
## Performance Testing
|
| 900 |
+
|
| 901 |
+
### Test Batch API Cost Savings
|
| 902 |
+
|
| 903 |
+
```bash
|
| 904 |
+
# Generate 10 documents
|
| 905 |
+
time python test_async_api.py --google-token TOKEN --num-solutions 10
|
| 906 |
+
|
| 907 |
+
# Compare with direct API (for reference)
|
| 908 |
+
curl -X POST http://localhost:8000/generate \
|
| 909 |
+
-H "Content-Type: application/json" \
|
| 910 |
+
-d '{"seed_images": ["..."], "prompt_params": {"num_solutions": 10}}'
|
| 911 |
+
```
|
| 912 |
+
|
| 913 |
+
**Expected Results:**
|
| 914 |
+
- **Batched API**: 10-20 minutes, ~$2.50 per 1M tokens
|
| 915 |
+
- **Direct API**: 3-5 minutes, ~$5.00 per 1M tokens
|
| 916 |
+
- **Cost Savings**: 50%
|
| 917 |
+
|
| 918 |
+
---
|
| 919 |
+
|
| 920 |
+
## Next Steps
|
| 921 |
+
|
| 922 |
+
1. ✅ Test locally with script
|
| 923 |
+
2. ✅ Verify Google Drive upload
|
| 924 |
+
3. ✅ Test with your frontend
|
| 925 |
+
4. ✅ Deploy to production (see [DEPLOYMENT.md](DEPLOYMENT.md))
|
| 926 |
+
5. ✅ Set up monitoring (see [SCALING.md](SCALING.md))
|
| 927 |
+
|
| 928 |
+
---
|
| 929 |
+
|
| 930 |
+
## Additional Resources
|
| 931 |
+
|
| 932 |
+
- **API Documentation**: http://localhost:8000/docs
|
| 933 |
+
- **Deployment Guide**: [DEPLOYMENT.md](DEPLOYMENT.md)
|
| 934 |
+
- **Scaling Guide**: [SCALING.md](SCALING.md)
|
| 935 |
+
- **Google OAuth Docs**: https://developers.google.com/identity/protocols/oauth2
|
| 936 |
+
- **Anthropic Batch API**: https://docs.anthropic.com/en/docs/batch-api
|
api/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DocGenie FastAPI - REST API for document generation.
|
| 3 |
+
"""
|
| 4 |
+
__version__ = "1.0.0"
|
api/config.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration settings for DocGenie API
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Settings:
|
| 9 |
+
"""API configuration settings"""
|
| 10 |
+
|
| 11 |
+
# ==================== LLM Configuration ====================
|
| 12 |
+
ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
|
| 13 |
+
CLAUDE_MODEL: str = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-5-20250929")
|
| 14 |
+
# Backward compatibility
|
| 15 |
+
LLM_MODEL: str = os.getenv("LLM_MODEL", CLAUDE_MODEL)
|
| 16 |
+
|
| 17 |
+
# ==================== Handwriting Service (Stage 3) ====================
|
| 18 |
+
HANDWRITING_SERVICE_URL: str = os.getenv(
|
| 19 |
+
"HANDWRITING_SERVICE_URL",
|
| 20 |
+
"http://localhost:8080"
|
| 21 |
+
)
|
| 22 |
+
RUNPOD_API_KEY: str = os.getenv("RUNPOD_API_KEY", "")
|
| 23 |
+
HANDWRITING_SERVICE_TIMEOUT: int = int(os.getenv("HANDWRITING_SERVICE_TIMEOUT", "300"))
|
| 24 |
+
HANDWRITING_SERVICE_MAX_RETRIES: int = int(os.getenv("HANDWRITING_SERVICE_MAX_RETRIES", "3"))
|
| 25 |
+
HANDWRITING_SERVICE_ENABLED: bool = os.getenv("HANDWRITING_SERVICE_ENABLED", "false").lower() == "true"
|
| 26 |
+
HANDWRITING_APPLY_BLUR: bool = os.getenv("HANDWRITING_APPLY_BLUR", "false").lower() == "true"
|
| 27 |
+
|
| 28 |
+
# ==================== OCR Service (Stage 4) ====================
|
| 29 |
+
OCR_SERVICE_URL: str = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")
|
| 30 |
+
OCR_SERVICE_TIMEOUT: int = int(os.getenv("OCR_SERVICE_TIMEOUT", "30"))
|
| 31 |
+
OCR_SERVICE_ENABLED: bool = os.getenv("OCR_SERVICE_ENABLED", "false").lower() == "true"
|
| 32 |
+
OCR_ENGINE: str = os.getenv("OCR_ENGINE", "microsoft_di")
|
| 33 |
+
OCR_DPI: int = int(os.getenv("OCR_DPI", "300")) # DPI for PDF to image conversion
|
| 34 |
+
|
| 35 |
+
# Local Tesseract OCR (alternative to remote service)
|
| 36 |
+
OCR_USE_LOCAL: bool = os.getenv("OCR_USE_LOCAL", "false").lower() == "true"
|
| 37 |
+
OCR_TESSERACT_LANG: str = os.getenv("OCR_TESSERACT_LANG", "eng") # Tesseract language
|
| 38 |
+
OCR_TESSERACT_CONFIG: str = os.getenv("OCR_TESSERACT_CONFIG", "--psm 3") # Tesseract config
|
| 39 |
+
|
| 40 |
+
# ==================== Stage 5: Dataset Packaging ====================
|
| 41 |
+
# Stage 16: BBox normalization
|
| 42 |
+
BBOX_NORMALIZATION_ENABLED: bool = os.getenv("BBOX_NORMALIZATION_ENABLED", "false").lower() == "true"
|
| 43 |
+
BBOX_NORMALIZATION_SCALE: str = os.getenv("BBOX_NORMALIZATION_SCALE", "0-1") # "0-1" or "0-1000"
|
| 44 |
+
|
| 45 |
+
# Stage 17: GT verification
|
| 46 |
+
GT_VERIFICATION_ENABLED: bool = os.getenv("GT_VERIFICATION_ENABLED", "false").lower() == "true"
|
| 47 |
+
GT_VERIFICATION_SIMILARITY_CUTOFF: float = float(os.getenv("GT_VERIFICATION_SIMILARITY_CUTOFF", "0.8"))
|
| 48 |
+
GT_VERIFICATION_OVERLAP_THRESHOLD: float = float(os.getenv("GT_VERIFICATION_OVERLAP_THRESHOLD", "0.5"))
|
| 49 |
+
|
| 50 |
+
# Stage 18: Analysis
|
| 51 |
+
ANALYSIS_ENABLED: bool = os.getenv("ANALYSIS_ENABLED", "false").lower() == "true"
|
| 52 |
+
ANALYSIS_MIN_ANNOTATION_COUNT: int = int(os.getenv("ANALYSIS_MIN_ANNOTATION_COUNT", "1"))
|
| 53 |
+
|
| 54 |
+
# Stage 19: Debug visualization
|
| 55 |
+
DEBUG_VISUALIZATION_ENABLED: bool = os.getenv("DEBUG_VISUALIZATION_ENABLED", "false").lower() == "true"
|
| 56 |
+
DEBUG_SHOW_TEXT_IN_BBOX: bool = os.getenv("DEBUG_SHOW_TEXT_IN_BBOX", "true").lower() == "true"
|
| 57 |
+
DEBUG_BBOX_COLOR_RGB: str = os.getenv("DEBUG_BBOX_COLOR_RGB", "255,0,0") # Red default
|
| 58 |
+
|
| 59 |
+
# Dataset export
|
| 60 |
+
DATASET_EXPORT_ENABLED: bool = os.getenv("DATASET_EXPORT_ENABLED", "false").lower() == "true"
|
| 61 |
+
DATASET_EXPORT_FORMAT: str = os.getenv("DATASET_EXPORT_FORMAT", "msgpack") # msgpack, coco, huggingface
|
| 62 |
+
DATASET_EXPORT_DIR: str = os.getenv("DATASET_EXPORT_DIR", "/tmp/docgenie_datasets")
|
| 63 |
+
DATASET_RESIZE_IMAGES: bool = os.getenv("DATASET_RESIZE_IMAGES", "false").lower() == "true"
|
| 64 |
+
DATASET_CLIP_BBOXES_TO_FOREGROUND: bool = os.getenv("DATASET_CLIP_BBOXES_TO_FOREGROUND", "false").lower() == "true"
|
| 65 |
+
|
| 66 |
+
# ==================== API Server Configuration ====================
|
| 67 |
+
API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
|
| 68 |
+
API_PORT: int = int(os.getenv("API_PORT", "8000"))
|
| 69 |
+
DEBUG_MODE: bool = os.getenv("DEBUG_MODE", "false").lower() == "true"
|
| 70 |
+
|
| 71 |
+
# ==================== CORS Configuration ====================
|
| 72 |
+
CORS_ORIGINS: List[str] = [
|
| 73 |
+
origin.strip()
|
| 74 |
+
for origin in os.getenv("CORS_ORIGINS", "*").split(",")
|
| 75 |
+
if origin.strip()
|
| 76 |
+
] or ["*"]
|
| 77 |
+
|
| 78 |
+
# ==================== File Storage ====================
|
| 79 |
+
TEMP_DIR: str = os.getenv("TEMP_DIR", "/tmp/docgenie_api")
|
| 80 |
+
|
| 81 |
+
# ==================== Logging ====================
|
| 82 |
+
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
|
| 83 |
+
|
| 84 |
+
# ==================== Database (Optional) ====================
|
| 85 |
+
DATABASE_URL: Optional[str] = os.getenv("DATABASE_URL", None)
|
| 86 |
+
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")
|
| 87 |
+
|
| 88 |
+
# ==================== Supabase ====================
|
| 89 |
+
SUPABASE_URL: str = os.getenv("SUPABASE_URL", "")
|
| 90 |
+
SUPABASE_KEY: str = os.getenv("SUPABASE_KEY", "")
|
| 91 |
+
|
| 92 |
+
# ==================== Background Jobs ====================
|
| 93 |
+
RQ_QUEUE_NAME: str = os.getenv("RQ_QUEUE_NAME", "docgenie")
|
| 94 |
+
BATCH_POLL_INTERVAL: int = int(os.getenv("BATCH_POLL_INTERVAL", "30")) # seconds
|
| 95 |
+
BATCH_PROMPT_CHUNK_SIZE: int = int(os.getenv("BATCH_PROMPT_CHUNK_SIZE", "4")) # documents per prompt
|
| 96 |
+
BATCH_DATA_DIR: str = os.getenv("BATCH_DATA_DIR", "/tmp/docgenie_batches")
|
| 97 |
+
MESSAGE_DATA_DIR: str = os.getenv("MESSAGE_DATA_DIR", "/tmp/docgenie_messages")
|
| 98 |
+
|
| 99 |
+
# ==================== Google Drive ====================
|
| 100 |
+
GOOGLE_DRIVE_FOLDER_NAME: str = os.getenv("GOOGLE_DRIVE_FOLDER_NAME", "DocGenie Documents")
|
| 101 |
+
GOOGLE_CLIENT_ID: Optional[str] = os.getenv("GOOGLE_CLIENT_ID", None) # For token refresh only
|
| 102 |
+
GOOGLE_CLIENT_SECRET: Optional[str] = os.getenv("GOOGLE_CLIENT_SECRET", None) # For token refresh only
|
| 103 |
+
|
| 104 |
+
# ==================== Monitoring ====================
|
| 105 |
+
SENTRY_DSN: Optional[str] = os.getenv("SENTRY_DSN", None)
|
| 106 |
+
ENABLE_METRICS: bool = os.getenv("ENABLE_METRICS", "false").lower() == "true"
|
| 107 |
+
METRICS_PORT: int = int(os.getenv("METRICS_PORT", "9090"))
|
| 108 |
+
|
| 109 |
+
# ==================== AWS (Optional) ====================
|
| 110 |
+
AWS_ACCESS_KEY_ID: Optional[str] = os.getenv("AWS_ACCESS_KEY_ID", None)
|
| 111 |
+
AWS_SECRET_ACCESS_KEY: Optional[str] = os.getenv("AWS_SECRET_ACCESS_KEY", None)
|
| 112 |
+
AWS_REGION: str = os.getenv("AWS_REGION", "us-east-1")
|
| 113 |
+
S3_BUCKET: Optional[str] = os.getenv("S3_BUCKET", None)
|
| 114 |
+
|
| 115 |
+
@classmethod
|
| 116 |
+
def validate(cls) -> bool:
|
| 117 |
+
"""Validate required settings"""
|
| 118 |
+
if not cls.ANTHROPIC_API_KEY:
|
| 119 |
+
raise ValueError("ANTHROPIC_API_KEY environment variable is required")
|
| 120 |
+
return True
|
| 121 |
+
|
| 122 |
+
@classmethod
|
| 123 |
+
def get_cors_origins(cls) -> List[str]:
|
| 124 |
+
"""Get CORS origins list"""
|
| 125 |
+
return cls.CORS_ORIGINS if cls.CORS_ORIGINS != ["*"] else ["*"]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
settings = Settings()
|
api/dataset_exporter.py
ADDED
|
@@ -0,0 +1,733 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dataset Export Manager for DocGenie API
|
| 3 |
+
|
| 4 |
+
Handles organizing generated documents into a proper dataset structure
|
| 5 |
+
following the original pipeline's SyntheticDatasetFileStructure pattern.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pathlib
|
| 9 |
+
import json
|
| 10 |
+
import base64
|
| 11 |
+
import shutil
|
| 12 |
+
from typing import Dict, List, Optional, Any
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class DatasetExporter:
|
| 16 |
+
"""
|
| 17 |
+
Manages export of generated documents to organized dataset structure.
|
| 18 |
+
|
| 19 |
+
Structure follows original pipeline pattern:
|
| 20 |
+
- Single msgpack for all documents
|
| 21 |
+
- Categorized folders (html/, pdf/, bbox/, etc.)
|
| 22 |
+
- Subfolders for per-document tokens
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, base_path: pathlib.Path, dataset_name: str = "docgenie_documents"):
    """Set up a dataset exporter rooted at ``base_path / dataset_name``.

    Args:
        base_path: Parent directory under which the dataset folder lives.
        dataset_name: Dataset name; becomes the subfolder name.
    """
    self.dataset_name = dataset_name
    self.base_path = base_path.joinpath(dataset_name)
    # Per-document records accumulated via add_document().
    self.documents = []

    # Eagerly build the full folder layout so later writes never hit a
    # missing directory.
    self._create_directory_structure()
|
| 39 |
+
|
| 40 |
+
def _create_directory_structure(self):
    """Create every directory of the dataset layout (idempotent)."""
    for target in (
        # Root level
        self.base_path,
        # HTML files and CSS
        self.html_dir,
        # PDF stages
        self.pdf_initial_dir,
        self.pdf_with_handwriting_dir,
        self.pdf_with_visual_elements_dir,
        self.pdf_final_dir,
        # Images
        self.img_dir,
        # Bounding boxes
        self.bbox_pdf_word_dir,
        self.bbox_pdf_char_dir,
        self.bbox_final_word_dir,
        self.bbox_final_segment_dir,
        self.bbox_final_normalized_word_dir,
        self.bbox_final_normalized_segment_dir,
        # Annotations
        self.raw_annotations_dir,
        self.gt_dir,
        self.gt_verification_dir,
        self.token_mapping_dir,
        # Handwriting
        self.handwriting_regions_dir,
        self.handwriting_tokens_dir,
        # Visual elements
        self.visual_element_definitions_dir,
        self.visual_element_images_dir,
        # Layout elements / geometries / OCR / analysis / debug
        self.layout_dir,
        self.geometries_dir,
        self.ocr_results_dir,
        self.analysis_dir,
        self.debug_dir,
    ):
        target.mkdir(parents=True, exist_ok=True)
| 98 |
+
|
| 99 |
+
# ==================== Directory Properties ====================
|
| 100 |
+
|
| 101 |
+
@property
def html_dir(self) -> pathlib.Path:
    """Folder for generated HTML and CSS sources."""
    return self.base_path.joinpath("html")
|
| 105 |
+
|
| 106 |
+
@property
def pdf_initial_dir(self) -> pathlib.Path:
    """Folder for PDFs before any synthesis is applied."""
    return self.base_path.joinpath("pdf", "pdf_initial")
|
| 110 |
+
|
| 111 |
+
@property
def pdf_with_handwriting_dir(self) -> pathlib.Path:
    """Folder for PDFs with only handwriting added."""
    return self.base_path.joinpath("pdf", "pdf_with_handwriting")
|
| 115 |
+
|
| 116 |
+
@property
def pdf_with_visual_elements_dir(self) -> pathlib.Path:
    """Folder for PDFs with only visual elements added."""
    return self.base_path.joinpath("pdf", "pdf_with_visual_elements")
|
| 120 |
+
|
| 121 |
+
@property
def pdf_final_dir(self) -> pathlib.Path:
    """Folder for PDFs with both handwriting and visual elements."""
    return self.base_path.joinpath("pdf", "pdf_final")
|
| 125 |
+
|
| 126 |
+
@property
def img_dir(self) -> pathlib.Path:
    """Folder for the final rendered images."""
    return self.base_path.joinpath("img")
|
| 130 |
+
|
| 131 |
+
@property
def bbox_pdf_word_dir(self) -> pathlib.Path:
    """Folder for word-level bounding boxes extracted from the PDF
    (ground-truth positions)."""
    return self.base_path.joinpath("bbox", "bbox_pdf", "word")
|
| 135 |
+
|
| 136 |
+
@property
def bbox_pdf_char_dir(self) -> pathlib.Path:
    """Folder for character-level bounding boxes extracted from the PDF."""
    return self.base_path.joinpath("bbox", "bbox_pdf", "char")
|
| 140 |
+
|
| 141 |
+
@property
def bbox_final_word_dir(self) -> pathlib.Path:
    """Folder for final word-level bounding boxes (OCR-derived when
    modifications were applied, otherwise taken from the PDF)."""
    return self.base_path.joinpath("bbox", "bbox_final", "word")
|
| 145 |
+
|
| 146 |
+
@property
def bbox_final_segment_dir(self) -> pathlib.Path:
    """Folder for final segment-level bounding boxes (OCR-derived when
    modifications were applied, otherwise taken from the PDF)."""
    return self.base_path.joinpath("bbox", "bbox_final", "segment")
|
| 150 |
+
|
| 151 |
+
@property
def bbox_final_normalized_word_dir(self) -> pathlib.Path:
    """Folder for normalized word-level bounding boxes."""
    return self.base_path.joinpath("bbox", "bbox_final_normalized", "word")
|
| 155 |
+
|
| 156 |
+
@property
def bbox_final_normalized_segment_dir(self) -> pathlib.Path:
    """Folder for normalized segment-level bounding boxes."""
    return self.base_path.joinpath("bbox", "bbox_final_normalized", "segment")
|
| 160 |
+
|
| 161 |
+
@property
def raw_annotations_dir(self) -> pathlib.Path:
    """Folder for raw annotations (layout boxes before normalization)."""
    return self.base_path.joinpath("annotations", "raw_annotations")
|
| 165 |
+
|
| 166 |
+
@property
def gt_dir(self) -> pathlib.Path:
    """Folder for ground-truth annotations."""
    return self.base_path.joinpath("annotations", "gt")
|
| 170 |
+
|
| 171 |
+
@property
def gt_verification_dir(self) -> pathlib.Path:
    """Folder for ground-truth verification results."""
    return self.base_path.joinpath("annotations", "gt_verification")
|
| 175 |
+
|
| 176 |
+
@property
def token_mapping_dir(self) -> pathlib.Path:
    """Folder for token-mapping files."""
    return self.base_path.joinpath("annotations", "token_mapping")
|
| 180 |
+
|
| 181 |
+
@property
def handwriting_regions_dir(self) -> pathlib.Path:
    """Folder for handwriting region definitions."""
    return self.base_path.joinpath("handwriting", "handwriting_regions")
|
| 185 |
+
|
| 186 |
+
@property
def handwriting_tokens_dir(self) -> pathlib.Path:
    """Folder for handwriting token images (per-document subfolders)."""
    return self.base_path.joinpath("handwriting", "handwriting_tokens")
|
| 190 |
+
|
| 191 |
+
@property
def visual_element_definitions_dir(self) -> pathlib.Path:
    """Folder for visual element definitions."""
    return self.base_path.joinpath("visual_elements", "visual_element_definitions")
|
| 195 |
+
|
| 196 |
+
@property
def visual_element_images_dir(self) -> pathlib.Path:
    """Folder for visual element images (per-document subfolders)."""
    return self.base_path.joinpath("visual_elements", "visual_element_images")
|
| 200 |
+
|
| 201 |
+
@property
def layout_dir(self) -> pathlib.Path:
    """Folder for layout element definitions."""
    return self.base_path.joinpath("layout")
|
| 205 |
+
|
| 206 |
+
@property
def geometries_dir(self) -> pathlib.Path:
    """Folder for geometries extracted from the HTML."""
    return self.base_path.joinpath("geometries")
|
| 210 |
+
|
| 211 |
+
@property
def ocr_results_dir(self) -> pathlib.Path:
    """Folder for OCR results."""
    return self.base_path.joinpath("ocr_results")
|
| 215 |
+
|
| 216 |
+
@property
def analysis_dir(self) -> pathlib.Path:
    """Folder for analysis statistics."""
    return self.base_path.joinpath("analysis")
|
| 220 |
+
|
| 221 |
+
@property
def debug_dir(self) -> pathlib.Path:
    """Folder for debug visualizations."""
    return self.base_path.joinpath("debug")
|
| 225 |
+
|
| 226 |
+
@property
def msgpack_path(self) -> pathlib.Path:
    """Path of the dataset msgpack file.

    All documents in the dataset are aggregated into this single msgpack
    for efficient loading during ML training.
    """
    return self.base_path.joinpath("dataset.msgpack")
|
| 235 |
+
|
| 236 |
+
@property
def metadata_path(self) -> pathlib.Path:
    """Path of the dataset metadata JSON file."""
    return self.base_path.joinpath("metadata.json")
|
| 240 |
+
|
| 241 |
+
# ==================== Export Methods ====================
|
| 242 |
+
|
| 243 |
+
def add_document(
    self,
    document_id: str,
    html: str,
    css: str,
    pdf_initial: Optional[bytes] = None,
    pdf_with_handwriting: Optional[bytes] = None,
    pdf_with_visual_elements: Optional[bytes] = None,
    pdf_final: Optional[bytes] = None,
    final_image: Optional[bytes] = None,
    ground_truth: Optional[dict] = None,
    raw_annotations: Optional[list] = None,
    bboxes_pdf_word: Optional[list] = None,
    bboxes_pdf_char: Optional[list] = None,
    bboxes_final_word: Optional[list] = None,
    bboxes_final_segment: Optional[list] = None,
    bboxes_normalized_word: Optional[dict] = None,
    bboxes_normalized_segment: Optional[dict] = None,
    gt_verification: Optional[dict] = None,
    token_mapping: Optional[dict] = None,
    handwriting_regions: Optional[list] = None,
    handwriting_images: Optional[dict] = None,  # {hw_id: base64_png}
    visual_elements: Optional[list] = None,
    visual_element_images: Optional[dict] = None,  # {ve_id: base64_png}
    layout_elements: Optional[list] = None,
    geometries: Optional[list] = None,  # List of element geometry dicts
    ocr_results: Optional[dict] = None,
    analysis_stats: Optional[dict] = None,
    debug_visualization: Optional[bytes] = None,
):
    """
    Add a document to the dataset export.

    HTML and CSS are always written; every other artifact is written only
    when a truthy value is supplied. Finally the document is recorded in
    ``self.documents`` for metadata/msgpack generation.

    Args:
        document_id: Unique document identifier
        html: Document HTML content
        css: Document CSS content
        pdf_initial: Initial PDF bytes (before modifications)
        pdf_with_handwriting: PDF bytes after handwriting insertion
        pdf_with_visual_elements: PDF bytes after visual element insertion (no handwriting)
        pdf_final: PDF bytes with both handwriting and visual elements
        final_image: Final rendered image (PNG bytes)
        ground_truth: Ground truth annotations
        raw_annotations: Raw layout boxes (before normalization)
        bboxes_pdf_word: Word-level bboxes from PDF (ground truth)
        bboxes_pdf_char: Character-level bboxes from PDF
        bboxes_final_word: Final word-level bboxes (OCR or PDF)
        bboxes_final_segment: Final segment-level bboxes (OCR or PDF)
        bboxes_normalized_word: Normalized word-level bboxes
        bboxes_normalized_segment: Normalized segment-level bboxes
        gt_verification: Ground truth verification results
        token_mapping: Token to bbox mapping
        handwriting_regions: Handwriting region metadata
        handwriting_images: Dict of handwriting token images
        visual_elements: Visual element metadata
        visual_element_images: Dict of visual element images
        layout_elements: Layout element definitions
        geometries: Extracted geometries from HTML
        ocr_results: OCR results
        analysis_stats: Analysis statistics
        debug_visualization: Debug visualization image (PNG bytes)
    """
    # Local helpers collapse the previously duplicated JSON-dump and
    # base64-image-folder boilerplate. Behavior (paths, formatting,
    # encoding) is unchanged.
    def _write_json(directory, payload):
        # Pretty-printed UTF-8 JSON named after the document.
        (directory / f"{document_id}.json").write_text(
            json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8'
        )

    def _write_images(parent_dir, images):
        # Decode a {image_id: base64_png} mapping into a per-document subfolder.
        doc_dir = parent_dir / document_id
        doc_dir.mkdir(parents=True, exist_ok=True)
        for img_id, img_b64 in images.items():
            (doc_dir / f"{img_id}.png").write_bytes(base64.b64decode(img_b64))

    # Save HTML and CSS (always present)
    (self.html_dir / f"{document_id}.html").write_text(html, encoding='utf-8')
    (self.html_dir / f"{document_id}.css").write_text(css, encoding='utf-8')

    # Save all PDF stages
    if pdf_initial:
        (self.pdf_initial_dir / f"{document_id}.pdf").write_bytes(pdf_initial)

    if pdf_with_handwriting:
        (self.pdf_with_handwriting_dir / f"{document_id}.pdf").write_bytes(pdf_with_handwriting)

    if pdf_with_visual_elements:
        (self.pdf_with_visual_elements_dir / f"{document_id}.pdf").write_bytes(pdf_with_visual_elements)

    if pdf_final:
        (self.pdf_final_dir / f"{document_id}.pdf").write_bytes(pdf_final)

    # Save final image
    if final_image:
        (self.img_dir / f"{document_id}.png").write_bytes(final_image)

    # Save annotations
    if raw_annotations:
        _write_json(self.raw_annotations_dir, raw_annotations)

    if ground_truth:
        _write_json(self.gt_dir, ground_truth)

    if gt_verification:
        _write_json(self.gt_verification_dir, gt_verification)

    if token_mapping:
        _write_json(self.token_mapping_dir, token_mapping)

    # Save bounding boxes
    if bboxes_pdf_word:
        _write_json(self.bbox_pdf_word_dir, bboxes_pdf_word)

    if bboxes_pdf_char:
        _write_json(self.bbox_pdf_char_dir, bboxes_pdf_char)

    if bboxes_final_word:
        _write_json(self.bbox_final_word_dir, bboxes_final_word)

    if bboxes_final_segment:
        _write_json(self.bbox_final_segment_dir, bboxes_final_segment)

    if bboxes_normalized_word:
        _write_json(self.bbox_final_normalized_word_dir, bboxes_normalized_word)

    if bboxes_normalized_segment:
        _write_json(self.bbox_final_normalized_segment_dir, bboxes_normalized_segment)

    # Save handwriting data
    if handwriting_regions:
        _write_json(self.handwriting_regions_dir, handwriting_regions)

    if handwriting_images:
        _write_images(self.handwriting_tokens_dir, handwriting_images)

    # Save visual element data
    if visual_elements:
        _write_json(self.visual_element_definitions_dir, visual_elements)

    if visual_element_images:
        _write_images(self.visual_element_images_dir, visual_element_images)

    # Save other data
    if layout_elements:
        _write_json(self.layout_dir, layout_elements)

    if geometries:
        _write_json(self.geometries_dir, geometries)

    if ocr_results:
        _write_json(self.ocr_results_dir, ocr_results)

    if analysis_stats:
        _write_json(self.analysis_dir, analysis_stats)

    if debug_visualization:
        (self.debug_dir / f"{document_id}_debug.png").write_bytes(debug_visualization)

    # Track document for metadata
    self.documents.append({
        'document_id': document_id,
        'has_handwriting': handwriting_regions is not None and len(handwriting_regions) > 0,
        'has_visual_elements': visual_elements is not None and len(visual_elements) > 0,
        'has_ocr': ocr_results is not None,
        'modification_type': (
            "both" if pdf_final
            else "handwriting" if pdf_with_handwriting
            else "visual_elements" if pdf_with_visual_elements
            else None
        )
    })
|
| 445 |
+
|
| 446 |
+
def finalize(
    self,
    request_id: Optional[str] = None,
    user_id: Optional[int] = None,
    prompt_params: Optional[dict] = None,
    api_mode: str = "sync"
) -> pathlib.Path:
    """
    Finalize the dataset export: write metadata and README, then optionally
    build the aggregated msgpack file.

    Args:
        request_id: Request UUID for tracking
        user_id: User ID who made the request
        prompt_params: Prompt parameters used for generation
        api_mode: "sync" or "async"

    Returns:
        Path to the dataset base directory
    """
    params = prompt_params or {}

    # Dataset-level metadata, including provenance of the generation request.
    metadata = {
        'dataset_name': self.dataset_name,
        'num_documents': len(self.documents),
        'documents': self.documents,
        'structure_version': '2.0',
        'structure_description': 'Organized dataset following original pipeline structure',
        'generation_metadata': {
            'request_id': request_id,
            'user_id': user_id,
            'api_mode': api_mode,
            'prompt_params': params
        }
    }
    self.metadata_path.write_text(
        json.dumps(metadata, indent=2, ensure_ascii=False), encoding='utf-8'
    )

    # Human-readable description of the export.
    (self.base_path / "README.md").write_text(self._generate_readme(), encoding='utf-8')

    # Msgpack aggregation is opt-in and additionally requires normalized
    # bboxes, because the msgpack samples store normalized coordinates.
    wants_export = params.get('enable_dataset_export', False)
    export_format = params.get('dataset_export_format', 'msgpack')
    if wants_export and export_format.lower() == 'msgpack':
        if params.get('enable_bbox_normalization', False):
            self._create_msgpack_dataset()
        else:
            print(f" ⚠ Msgpack export requested but bbox_normalization is disabled")
            print(f" Msgpack requires normalized bboxes. Enable 'enable_bbox_normalization: true' to export msgpack.")

    return self.base_path
|
| 503 |
+
|
| 504 |
+
def _create_msgpack_dataset(self):
|
| 505 |
+
"""
|
| 506 |
+
Create a single msgpack file aggregating all documents.
|
| 507 |
+
|
| 508 |
+
This follows the original pipeline's approach of creating one msgpack
|
| 509 |
+
with all documents for easy loading in ML training pipelines.
|
| 510 |
+
"""
|
| 511 |
+
try:
|
| 512 |
+
from datadings.writer import FileWriter
|
| 513 |
+
|
| 514 |
+
print(f" 📦 Creating msgpack dataset...")
|
| 515 |
+
|
| 516 |
+
# Collect all samples
|
| 517 |
+
samples = []
|
| 518 |
+
for doc in self.documents:
|
| 519 |
+
doc_id = doc['document_id']
|
| 520 |
+
|
| 521 |
+
# Read normalized bboxes (required for msgpack)
|
| 522 |
+
bbox_word_path = self.bbox_final_normalized_word_dir / f"{doc_id}.json"
|
| 523 |
+
bbox_segment_path = self.bbox_final_normalized_segment_dir / f"{doc_id}.json"
|
| 524 |
+
|
| 525 |
+
# Skip if bboxes don't exist
|
| 526 |
+
if not bbox_word_path.exists():
|
| 527 |
+
print(f" ⚠ Skipping {doc_id}: no normalized bboxes found")
|
| 528 |
+
continue
|
| 529 |
+
|
| 530 |
+
# Read word bboxes
|
| 531 |
+
word_bboxes_data = json.loads(bbox_word_path.read_text(encoding='utf-8'))
|
| 532 |
+
|
| 533 |
+
# Read segment bboxes (fallback to word if not available)
|
| 534 |
+
if bbox_segment_path.exists():
|
| 535 |
+
segment_bboxes_data = json.loads(bbox_segment_path.read_text(encoding='utf-8'))
|
| 536 |
+
else:
|
| 537 |
+
segment_bboxes_data = word_bboxes_data
|
| 538 |
+
|
| 539 |
+
# Extract words and bboxes
|
| 540 |
+
words = [item['text'] for item in word_bboxes_data if 'text' in item]
|
| 541 |
+
word_bboxes = [item['bbox'] for item in word_bboxes_data if 'bbox' in item]
|
| 542 |
+
segment_bboxes = [item['bbox'] for item in segment_bboxes_data if 'bbox' in item]
|
| 543 |
+
|
| 544 |
+
# Ensure bbox format is [x0, y0, x2, y2] (normalized)
|
| 545 |
+
# If dict format, convert
|
| 546 |
+
word_bboxes = [
|
| 547 |
+
bbox if isinstance(bbox, list) else [bbox['x0'], bbox['y0'], bbox['x2'], bbox['y2']]
|
| 548 |
+
for bbox in word_bboxes
|
| 549 |
+
]
|
| 550 |
+
segment_bboxes = [
|
| 551 |
+
bbox if isinstance(bbox, list) else [bbox['x0'], bbox['y0'], bbox['x2'], bbox['y2']]
|
| 552 |
+
for bbox in segment_bboxes
|
| 553 |
+
]
|
| 554 |
+
|
| 555 |
+
# Read ground truth
|
| 556 |
+
gt_path = self.gt_dir / f"{doc_id}.json"
|
| 557 |
+
annotations = {}
|
| 558 |
+
if gt_path.exists():
|
| 559 |
+
annotations = json.loads(gt_path.read_text(encoding='utf-8'))
|
| 560 |
+
|
| 561 |
+
# Determine image file path
|
| 562 |
+
img_path = self.img_dir / f"{doc_id}.png"
|
| 563 |
+
if not img_path.exists():
|
| 564 |
+
# Fallback to PDF
|
| 565 |
+
img_path = self.pdf_final_dir / f"{doc_id}.pdf"
|
| 566 |
+
if not img_path.exists():
|
| 567 |
+
img_path = self.pdf_initial_dir / f"{doc_id}.pdf"
|
| 568 |
+
|
| 569 |
+
# Create sample dictionary matching original pipeline format
|
| 570 |
+
sample = {
|
| 571 |
+
'key': doc_id,
|
| 572 |
+
'sample_id': doc_id,
|
| 573 |
+
'image_file_path': str(img_path),
|
| 574 |
+
'words': words,
|
| 575 |
+
'word_bboxes': word_bboxes,
|
| 576 |
+
'segment_level_bboxes': segment_bboxes,
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
# Add annotations if present
|
| 580 |
+
if annotations:
|
| 581 |
+
sample.update(annotations)
|
| 582 |
+
|
| 583 |
+
samples.append(sample)
|
| 584 |
+
|
| 585 |
+
if not samples:
|
| 586 |
+
print(f" ⚠ No samples to write to msgpack - skipping")
|
| 587 |
+
return
|
| 588 |
+
|
| 589 |
+
# Write all samples to msgpack
|
| 590 |
+
with FileWriter(self.msgpack_path, overwrite=True) as writer:
|
| 591 |
+
for sample in samples:
|
| 592 |
+
writer.write(sample)
|
| 593 |
+
|
| 594 |
+
print(f" ✓ Created msgpack dataset: {self.msgpack_path.name} ({len(samples)} documents)")
|
| 595 |
+
|
| 596 |
+
except ImportError:
|
| 597 |
+
print(f" ⚠ datadings not installed - skipping msgpack creation")
|
| 598 |
+
print(f" Install with: pip install datadings")
|
| 599 |
+
except Exception as e:
|
| 600 |
+
print(f" ⚠ Failed to create msgpack: {str(e)}")
|
| 601 |
+
import traceback
|
| 602 |
+
traceback.print_exc()
|
| 603 |
+
|
| 604 |
+
def _generate_readme(self) -> str:
    """Generate README content for the dataset.

    Returns a markdown document describing the export's folder layout,
    per-feature statistics computed from ``self.documents``, and usage
    examples for both the aggregated msgpack and individual files.
    """
    # NOTE(review): the template below was recovered from a collapsed diff
    # view; alignment whitespace inside the tree diagram may differ from the
    # pre-existing output — confirm against a previously generated README.
    return f"""# DocGenie Dataset: {self.dataset_name}

Generated using DocGenie API - Synthetic Document Generation Pipeline

## Dataset Structure

This dataset follows the original pipeline's organized structure with categorized folders:

```
{self.dataset_name}/
├── dataset.msgpack # Aggregated dataset (all documents)
├── metadata.json # Dataset metadata
├── README.md # This file
│
├── html/ # HTML and CSS files
│ ├── document_1.html
│ ├── document_1.css
│ └── ...
│
├── pdf/ # PDF files at different stages
│ ├── pdf_initial/ # Before synthesis
│ ├── pdf_with_handwriting/ # With handwriting only
│ ├── pdf_with_visual_elements/ # With visual elements only
│ └── pdf_final/ # With both features
│
├── img/ # Final rendered images
│ ├── document_1.png
│ └── ...
│
├── bbox/ # Bounding boxes
│ ├── bbox_pdf/ # Extracted from PDF (ground truth positions)
│ │ ├── word/ # Word-level from PDF
│ │ └── char/ # Character-level from PDF
│ ├── bbox_final/ # Final bboxes (OCR if modified, else PDF)
│ │ ├── word/ # Word-level (unnormalized)
│ │ └── segment/ # Segment-level (unnormalized)
│ └── bbox_final_normalized/ # Normalized (0-1 range)
│ ├── word/ # Word-level normalized
│ └── segment/ # Segment-level normalized
│
├── annotations/ # Ground truth and mappings
│ ├── raw_annotations/ # Raw layout boxes (before normalization)
│ ├── gt/ # Ground truth annotations
│ ├── gt_verification/ # Verification results
│ └── token_mapping/ # Token-to-bbox mappings
│
├── handwriting/ # Handwriting data
│ ├── handwriting_regions/ # Region definitions
│ └── handwriting_tokens/ # Token images (subfolders per document)
│ ├── document_1/
│ │ ├── hw1_b3_l1_w0.png
│ │ └── ...
│ └── ...
│
├── visual_elements/ # Visual element data
│ ├── visual_element_definitions/ # Element definitions
│ └── visual_element_images/ # Element images (subfolders per document)
│ ├── document_1/
│ │ ├── ve0.png
│ │ └── ...
│ └── ...
│
├── layout/ # Layout element definitions
├── geometries/ # Extracted geometries
├── ocr_results/ # OCR results
├── analysis/ # Analysis statistics
└── debug/ # Debug visualizations
```

## Dataset Statistics

- **Total Documents**: {len(self.documents)}
- **Documents with Handwriting**: {sum(1 for d in self.documents if d['has_handwriting'])}
- **Documents with Visual Elements**: {sum(1 for d in self.documents if d['has_visual_elements'])}
- **Documents with OCR**: {sum(1 for d in self.documents if d['has_ocr'])}

## Usage

This dataset is designed for document understanding and OCR tasks. Files are organized by category for easy access and processing.

### Loading the Entire Dataset (Msgpack)

The easiest way to load all documents for ML training:

```python
from datadings.reader import MsgpackReader

# Load the aggregated dataset
reader = MsgpackReader('dataset.msgpack')

# Iterate through all documents
for sample in reader:
    doc_id = sample['sample_id']
    words = sample['words']
    word_bboxes = sample['word_bboxes'] # Normalized [x0, y0, x2, y2]
    image_path = sample['image_file_path']
    # Ground truth annotations are included in the sample
```

For more information on msgpack format, see: https://github.com/mweiss/datadings

### Loading Individual Documents

Each document is identified by its `document_id` (e.g., "document_1"). To load a document:

1. **HTML/CSS**: `html/document_1.html`, `html/document_1.css`
2. **PDF stages**: Check `pdf/pdf_initial/`, `pdf/pdf_final/`, etc.
3. **Images**: `img/document_1.png`
4. **Annotations**: `annotations/gt/document_1.json`, `annotations/raw_annotations/document_1.json`
5. **Bounding boxes**:
- PDF-extracted (ground truth): `bbox/bbox_pdf/word/document_1.json`, `bbox/bbox_pdf/char/document_1.json`
- Final bboxes: `bbox/bbox_final/word/document_1.json` (OCR or PDF)
- Normalized: `bbox/bbox_final_normalized/word/document_1.json`
6. **Tokens**: `handwriting/handwriting_tokens/document_1/`, `visual_elements/visual_element_images/document_1/`

### Notes

- Bounding boxes in `bbox_pdf` are extracted from PDF and represent ground truth text positions
- Bounding boxes in `bbox_final` are from OCR (if document has handwriting/visual elements) or PDF (otherwise)
- Bounding boxes in `bbox_final_normalized` are normalized to [0, 1] range for ML training
- Character-level bboxes (`bbox_pdf/char/`) provide fine-grained text localization
- Raw annotations show the original layout boxes before normalization
- Token images are organized in per-document subfolders
- OCR results and analysis are only present if those features were enabled

---
Generated by DocGenie API v2.0
"""
|
api/example_usage.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example usage of the DocGenie API.
|
| 3 |
+
Demonstrates how to call the API and save generated documents.
|
| 4 |
+
"""
|
| 5 |
+
import asyncio
|
| 6 |
+
import base64
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import httpx
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
async def generate_documents_example():
    """
    Example: generate documents from seed images via the /generate endpoint,
    then persist every artifact (PDF, HTML, CSS, ground truth, bboxes)
    under ``api_output/``.
    """
    api_url = "http://localhost:8000/generate"

    # Example seed image URLs (replace with your actual URLs)
    seed_image_urls = [
        "https://example.com/receipt1.jpg",
        "https://example.com/receipt2.jpg",
        # Add more seed image URLs here
    ]

    payload = {
        "seed_images": seed_image_urls,
        "prompt_params": {
            "language": "English",
            "doc_type": "business and administrative documents",
            "gt_type": "Multiple questions about each document, with their answers taken **verbatim** from the document.",
            "gt_format": '{"<Text of question 1>": "<Answer to question 1>", "<Text of question 2>": "<Answer to question 2>", ...}',
            "num_solutions": 3
        },
        "model": "claude-sonnet-4-5-20250929"
        # "api_key": "your-api-key"  # Optional if ANTHROPIC_API_KEY env var is set
    }

    print("Sending request to DocGenie API...")
    print(f"Seed images: {len(seed_image_urls)}")
    print(f"Requested solutions: {payload['prompt_params']['num_solutions']}")

    # Generation can be slow, hence the generous timeout.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(api_url, json=payload)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return

        result = response.json()
        print(f"\nSuccess! Generated {result['total_documents']} documents")

        output_dir = Path("api_output")
        output_dir.mkdir(exist_ok=True)

        for idx, doc in enumerate(result["documents"]):
            doc_id = doc["document_id"]
            print(f"\n--- Document {idx + 1} (ID: {doc_id}) ---")

            # The PDF arrives base64-encoded; decode before writing.
            pdf_path = output_dir / f"{doc_id}.pdf"
            pdf_path.write_bytes(base64.b64decode(doc["pdf_base64"]))
            print(f" PDF saved: {pdf_path}")

            html_path = output_dir / f"{doc_id}.html"
            html_path.write_text(doc["html"], encoding="utf-8")
            print(f" HTML saved: {html_path}")

            css_path = output_dir / f"{doc_id}.css"
            css_path.write_text(doc["css"], encoding="utf-8")
            print(f" CSS saved: {css_path}")

            if doc["ground_truth"]:
                gt_path = output_dir / f"{doc_id}_gt.json"
                gt_path.write_text(
                    json.dumps(doc["ground_truth"], indent=2, ensure_ascii=False),
                    encoding="utf-8",
                )
                print(f" Ground truth saved: {gt_path}")
                print(f" GT entries: {len(doc['ground_truth'])}")

            bbox_path = output_dir / f"{doc_id}_bboxes.json"
            bbox_path.write_text(
                json.dumps(list(doc["bboxes"]), indent=2), encoding="utf-8"
            )
            print(f" Bounding boxes saved: {bbox_path}")
            print(f" BBox count: {len(doc['bboxes'])}")

            print(f" Dimensions: {doc['page_width_mm']:.1f}mm x {doc['page_height_mm']:.1f}mm")

        print(f"\n✅ All files saved to: {output_dir.absolute()}")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
async def health_check_example():
    """
    Example: verify that the DocGenie API is up by calling /health and
    printing the reported status and version.
    """
    api_url = "http://localhost:8000/health"

    print("Checking API health...")

    async with httpx.AsyncClient() as client:
        response = await client.get(api_url)

        if response.status_code != 200:
            print(f"❌ API is not responding: {response.status_code}")
        else:
            result = response.json()
            print(f"✅ API is healthy!")
            print(f" Status: {result['status']}")
            print(f" Version: {result['version']}")
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
    print("DocGenie API - Example Usage\n")

    # Verify the API is reachable before anything else.
    asyncio.run(health_check_example())

    print("\n" + "=" * 60 + "\n")

    # Document generation is opt-in because it needs real seed-image URLs.
    # NOTE: Replace seed_image_urls in the function with actual URLs first.
    # asyncio.run(generate_documents_example())

    print("\n⚠️ To run document generation:")
    for step in (
        " 1. Make sure the API is running (python api/main.py)",
        " 2. Replace seed_image_urls in this script with actual image URLs",
        " 3. Set ANTHROPIC_API_KEY environment variable",
        " 4. Uncomment the generate_documents_example() line above",
    ):
        print(step)
|
api/google_drive.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Google Drive integration for uploading generated documents.
|
| 3 |
+
Accepts OAuth tokens directly from frontend (no backend OAuth flow).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import io
|
| 7 |
+
import pathlib
|
| 8 |
+
from typing import Optional
|
| 9 |
+
from google.oauth2.credentials import Credentials
|
| 10 |
+
from googleapiclient.discovery import build
|
| 11 |
+
from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload
|
| 12 |
+
from googleapiclient.errors import HttpError
|
| 13 |
+
from google.auth.transport.requests import Request
|
| 14 |
+
from datetime import datetime, timedelta
|
| 15 |
+
|
| 16 |
+
from .config import settings
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class GoogleDriveClient:
|
| 20 |
+
"""Google Drive API client for file uploads using frontend-provided tokens"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, access_token: str, refresh_token: Optional[str] = None):
    """
    Build a Drive client from OAuth tokens supplied by the frontend.

    Args:
        access_token: Google OAuth access token (provided by frontend)
        refresh_token: Google OAuth refresh token (optional, for token renewal)

    Raises:
        ValueError: If token is invalid or expired
    """
    self.access_token = access_token
    self.refresh_token = refresh_token
    # Credentials are validated (and refreshed when possible) up front so a
    # bad token fails fast instead of midway through an upload.
    self.credentials = self._create_credentials()
    self.service = build('drive', 'v3', credentials=self.credentials)
|
| 37 |
+
|
| 38 |
+
def _create_credentials(self) -> Credentials:
|
| 39 |
+
"""Create credentials object from provided tokens"""
|
| 40 |
+
# Validate refresh token requirements
|
| 41 |
+
if self.refresh_token:
|
| 42 |
+
# If refresh_token is provided, we need client credentials for auto-refresh
|
| 43 |
+
if not settings.GOOGLE_CLIENT_ID or not settings.GOOGLE_CLIENT_SECRET:
|
| 44 |
+
raise ValueError(
|
| 45 |
+
"GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET must be set in .env "
|
| 46 |
+
"to support token refresh. Either:\n"
|
| 47 |
+
"1. Set these environment variables, OR\n"
|
| 48 |
+
"2. Ensure the access token doesn't expire during processing (get fresh token)"
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
credentials = Credentials(
|
| 52 |
+
token=self.access_token,
|
| 53 |
+
refresh_token=self.refresh_token,
|
| 54 |
+
token_uri='https://oauth2.googleapis.com/token',
|
| 55 |
+
client_id=settings.GOOGLE_CLIENT_ID,
|
| 56 |
+
client_secret=settings.GOOGLE_CLIENT_SECRET,
|
| 57 |
+
scopes=['https://www.googleapis.com/auth/drive.file']
|
| 58 |
+
)
|
| 59 |
+
else:
|
| 60 |
+
# No refresh token - token must be valid for entire operation
|
| 61 |
+
credentials = Credentials(
|
| 62 |
+
token=self.access_token,
|
| 63 |
+
scopes=['https://www.googleapis.com/auth/drive.file']
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# Try to refresh if expired upfront (only if refresh_token available)
|
| 67 |
+
if credentials.expired and credentials.refresh_token:
|
| 68 |
+
try:
|
| 69 |
+
print(f"[Google Drive] Token expired, refreshing...")
|
| 70 |
+
credentials.refresh(Request())
|
| 71 |
+
print(f"[Google Drive] Token refreshed successfully")
|
| 72 |
+
except Exception as e:
|
| 73 |
+
raise ValueError(
|
| 74 |
+
f"Failed to refresh Google Drive token: {str(e)}. "
|
| 75 |
+
"User needs to re-authenticate."
|
| 76 |
+
)
|
| 77 |
+
elif credentials.expired:
|
| 78 |
+
raise ValueError(
|
| 79 |
+
"Google Drive token has expired and no refresh token provided. "
|
| 80 |
+
"User needs to re-authenticate with a fresh token."
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
return credentials
|
| 84 |
+
|
| 85 |
+
def upload_file(
|
| 86 |
+
self,
|
| 87 |
+
file_path: pathlib.Path,
|
| 88 |
+
filename: Optional[str] = None,
|
| 89 |
+
folder_name: str = "DocGenie Documents",
|
| 90 |
+
mime_type: str = "application/zip"
|
| 91 |
+
) -> str:
|
| 92 |
+
"""
|
| 93 |
+
Upload a file to user's Google Drive.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
file_path: Path to local file to upload
|
| 97 |
+
filename: Name for file in Google Drive (default: use file_path name)
|
| 98 |
+
folder_name: Name of folder to create/use in Drive
|
| 99 |
+
mime_type: MIME type of the file
|
| 100 |
+
|
| 101 |
+
Returns:
|
| 102 |
+
Google Drive file URL (shareable link)
|
| 103 |
+
|
| 104 |
+
Raises:
|
| 105 |
+
HttpError: If upload fails
|
| 106 |
+
"""
|
| 107 |
+
try:
|
| 108 |
+
# Get or create folder
|
| 109 |
+
folder_id = self._get_or_create_folder(folder_name)
|
| 110 |
+
|
| 111 |
+
# Prepare file metadata
|
| 112 |
+
file_metadata = {
|
| 113 |
+
'name': filename or file_path.name,
|
| 114 |
+
'parents': [folder_id]
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
# Upload file
|
| 118 |
+
media = MediaFileUpload(
|
| 119 |
+
str(file_path),
|
| 120 |
+
mimetype=mime_type,
|
| 121 |
+
resumable=True
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
file = self.service.files().create(
|
| 125 |
+
body=file_metadata,
|
| 126 |
+
media_body=media,
|
| 127 |
+
fields='id, webViewLink, webContentLink'
|
| 128 |
+
).execute()
|
| 129 |
+
|
| 130 |
+
# Make file accessible (reader permissions)
|
| 131 |
+
self._share_file(file['id'])
|
| 132 |
+
|
| 133 |
+
# Return shareable link
|
| 134 |
+
return file.get('webViewLink', file.get('webContentLink'))
|
| 135 |
+
|
| 136 |
+
except HttpError as error:
|
| 137 |
+
print(f"Google Drive upload error: {error}")
|
| 138 |
+
raise
|
| 139 |
+
|
| 140 |
+
def upload_bytes(
|
| 141 |
+
self,
|
| 142 |
+
file_bytes: bytes,
|
| 143 |
+
filename: str,
|
| 144 |
+
folder_name: str = "DocGenie Documents",
|
| 145 |
+
mime_type: str = "application/zip"
|
| 146 |
+
) -> str:
|
| 147 |
+
"""
|
| 148 |
+
Upload bytes directly to Google Drive (without saving to disk).
|
| 149 |
+
|
| 150 |
+
Args:
|
| 151 |
+
file_bytes: File content as bytes
|
| 152 |
+
filename: Name for file in Google Drive
|
| 153 |
+
folder_name: Name of folder to create/use in Drive
|
| 154 |
+
mime_type: MIME type of the file
|
| 155 |
+
|
| 156 |
+
Returns:
|
| 157 |
+
Google Drive file URL (shareable link)
|
| 158 |
+
"""
|
| 159 |
+
try:
|
| 160 |
+
folder_id = self._get_or_create_folder(folder_name)
|
| 161 |
+
|
| 162 |
+
file_metadata = {
|
| 163 |
+
'name': filename,
|
| 164 |
+
'parents': [folder_id]
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
# Create media from bytes
|
| 168 |
+
media = MediaIoBaseUpload(
|
| 169 |
+
io.BytesIO(file_bytes),
|
| 170 |
+
mimetype=mime_type,
|
| 171 |
+
resumable=True
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
file = self.service.files().create(
|
| 175 |
+
body=file_metadata,
|
| 176 |
+
media_body=media,
|
| 177 |
+
fields='id, webViewLink, webContentLink'
|
| 178 |
+
).execute()
|
| 179 |
+
|
| 180 |
+
self._share_file(file['id'])
|
| 181 |
+
|
| 182 |
+
return file.get('webViewLink', file.get('webContentLink'))
|
| 183 |
+
|
| 184 |
+
except HttpError as error:
|
| 185 |
+
print(f"Google Drive upload error: {error}")
|
| 186 |
+
raise
|
| 187 |
+
|
| 188 |
+
def _get_or_create_folder(self, folder_name: str) -> str:
|
| 189 |
+
"""
|
| 190 |
+
Get or create a folder in user's Google Drive.
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
folder_name: Name of the folder
|
| 194 |
+
|
| 195 |
+
Returns:
|
| 196 |
+
Folder ID
|
| 197 |
+
"""
|
| 198 |
+
# Search for existing folder
|
| 199 |
+
query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
|
| 200 |
+
results = self.service.files().list(
|
| 201 |
+
q=query,
|
| 202 |
+
spaces='drive',
|
| 203 |
+
fields='files(id, name)'
|
| 204 |
+
).execute()
|
| 205 |
+
|
| 206 |
+
folders = results.get('files', [])
|
| 207 |
+
|
| 208 |
+
if folders:
|
| 209 |
+
# Folder exists, return its ID
|
| 210 |
+
return folders[0]['id']
|
| 211 |
+
|
| 212 |
+
# Create new folder
|
| 213 |
+
file_metadata = {
|
| 214 |
+
'name': folder_name,
|
| 215 |
+
'mimeType': 'application/vnd.google-apps.folder'
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
folder = self.service.files().create(
|
| 219 |
+
body=file_metadata,
|
| 220 |
+
fields='id'
|
| 221 |
+
).execute()
|
| 222 |
+
|
| 223 |
+
return folder['id']
|
| 224 |
+
|
| 225 |
+
def _share_file(self, file_id: str):
|
| 226 |
+
"""
|
| 227 |
+
Make file shareable (anyone with link can view).
|
| 228 |
+
|
| 229 |
+
Args:
|
| 230 |
+
file_id: Google Drive file ID
|
| 231 |
+
"""
|
| 232 |
+
try:
|
| 233 |
+
permission = {
|
| 234 |
+
'type': 'anyone',
|
| 235 |
+
'role': 'reader'
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
self.service.permissions().create(
|
| 239 |
+
fileId=file_id,
|
| 240 |
+
body=permission
|
| 241 |
+
).execute()
|
| 242 |
+
|
| 243 |
+
except HttpError as error:
|
| 244 |
+
print(f"Warning: Could not share file {file_id}: {error}")
|
| 245 |
+
# Don't raise - file uploaded successfully even if sharing fails
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def upload_to_google_drive(
    access_token: str,
    file_path: pathlib.Path,
    refresh_token: Optional[str] = None,
    filename: Optional[str] = None
) -> str:
    """
    Upload a local file to the user's Google Drive and return its link.

    Thin convenience wrapper around GoogleDriveClient for one-shot uploads.

    Args:
        access_token: Google OAuth access token (from frontend)
        file_path: Path to file to upload
        refresh_token: Google OAuth refresh token (optional)
        filename: Optional custom filename

    Returns:
        Google Drive URL

    Raises:
        ValueError: If token is invalid or expired
        HttpError: If upload fails
    """
    drive = GoogleDriveClient(
        access_token=access_token,
        refresh_token=refresh_token,
    )
    return drive.upload_file(file_path, filename)
|
api/main.py
ADDED
|
@@ -0,0 +1,1756 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for DocGenie document generation.
|
| 3 |
+
|
| 4 |
+
FULLY INTEGRATED PIPELINE (All 19 Stages):
|
| 5 |
+
|
| 6 |
+
✅ Stage 1-2: Core Pipeline (Stages 01-06)
|
| 7 |
+
1. Seed Selection: Download and encode seed images
|
| 8 |
+
2. LLM Prompting: Call Claude API (batched client support)
|
| 9 |
+
3. Response Processing: Extract and validate HTML/GT
|
| 10 |
+
4. PDF Rendering: Generate PDFs with geometry extraction
|
| 11 |
+
5. BBox Extraction: Extract bounding boxes from PDFs
|
| 12 |
+
6. Validation: Verify geometries and bboxes
|
| 13 |
+
|
| 14 |
+
✅ Stage 3: Feature Synthesis (Stages 07-13)
|
| 15 |
+
7. Extract handwriting definitions from HTML
|
| 16 |
+
8. Extract visual element definitions from HTML
|
| 17 |
+
9. Generate handwriting images (WordStylist diffusion model)
|
| 18 |
+
10. Create visual elements (stamps, barcodes, logos)
|
| 19 |
+
11. Render second-pass PDF with features
|
| 20 |
+
12. Insert handwriting images into PDF
|
| 21 |
+
13. Insert visual elements into PDF
|
| 22 |
+
|
| 23 |
+
✅ Stage 4: Image Finalization & OCR (Stages 14-15)
|
| 24 |
+
14. Render final PDF to high-quality image (pdf2image)
|
| 25 |
+
15. Perform OCR on final image (Microsoft Document Intelligence)
|
| 26 |
+
|
| 27 |
+
✅ Stage 5: Dataset Packaging (Stages 16-19)
|
| 28 |
+
16. Normalize bounding boxes to [0,1] scale
|
| 29 |
+
17. Verify and prepare ground truth annotations
|
| 30 |
+
18. Generate document analysis and statistics
|
| 31 |
+
19. Create debug visualization overlays
|
| 32 |
+
|
| 33 |
+
See API_PIPELINE_STATUS.md for detailed integration status.
|
| 34 |
+
"""
|
| 35 |
+
import os
|
| 36 |
+
import sys
|
| 37 |
+
import pathlib
|
| 38 |
+
import tempfile
|
| 39 |
+
import uuid
|
| 40 |
+
import json
|
| 41 |
+
import zipfile
|
| 42 |
+
import asyncio
|
| 43 |
+
import shutil
|
| 44 |
+
import warnings
|
| 45 |
+
from typing import List, Optional
|
| 46 |
+
from contextlib import asynccontextmanager
|
| 47 |
+
|
| 48 |
+
# Suppress resource_tracker warnings in development mode (with uvicorn --reload)
|
| 49 |
+
# These warnings are harmless - they occur because the reloader creates child processes
|
| 50 |
+
# that share semaphores. The lifespan handler below ensures proper cleanup.
|
| 51 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="resource_tracker")
|
| 52 |
+
|
| 53 |
+
# Load environment variables from .env file if it exists
|
| 54 |
+
from dotenv import load_dotenv
|
| 55 |
+
load_dotenv()
|
| 56 |
+
|
| 57 |
+
# Add parent directory to path for docgenie imports
|
| 58 |
+
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
|
| 59 |
+
|
| 60 |
+
from fastapi import FastAPI, HTTPException, status, BackgroundTasks
|
| 61 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 62 |
+
from fastapi.responses import FileResponse, StreamingResponse
|
| 63 |
+
import uvicorn
|
| 64 |
+
import io
|
| 65 |
+
|
| 66 |
+
from docgenie import ENV
|
| 67 |
+
|
| 68 |
+
from .schemas import (
|
| 69 |
+
GenerateDocumentRequest,
|
| 70 |
+
GenerateDocumentResponse,
|
| 71 |
+
DocumentResult,
|
| 72 |
+
BoundingBox,
|
| 73 |
+
HealthResponse,
|
| 74 |
+
DatasetExportInfo
|
| 75 |
+
)
|
| 76 |
+
from .utils import (
|
| 77 |
+
download_image_to_base64,
|
| 78 |
+
build_prompt,
|
| 79 |
+
call_claude_api_direct,
|
| 80 |
+
extract_html_documents_from_response,
|
| 81 |
+
extract_ground_truth,
|
| 82 |
+
extract_css_from_html,
|
| 83 |
+
render_html_to_pdf,
|
| 84 |
+
extract_bboxes_from_rendered_pdf,
|
| 85 |
+
pdf_to_base64,
|
| 86 |
+
validate_html_structure,
|
| 87 |
+
validate_pdf,
|
| 88 |
+
validate_bboxes,
|
| 89 |
+
process_stage3_complete,
|
| 90 |
+
process_stage4_ocr,
|
| 91 |
+
process_stage5_complete
|
| 92 |
+
)
|
| 93 |
+
from .config import settings
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Lifespan context manager for proper startup/shutdown
|
| 97 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Handle application lifecycle - startup and shutdown.

    Registered on the FastAPI app via the ``lifespan=`` argument; everything
    before ``yield`` runs at startup, everything after at shutdown.
    """
    # Startup
    print("🚀 DocGenie API starting up...")
    yield
    # Shutdown - give pending tasks time to complete
    print("🛑 DocGenie API shutting down gracefully...")
    await asyncio.sleep(0.5)  # Allow pending async operations to complete
    print("✓ Shutdown complete")
+
|
| 108 |
+
|
| 109 |
+
# Initialize FastAPI app with the lifespan handler defined above; interactive
# API docs are served at /docs.
app = FastAPI(
    title="DocGenie API",
    description="API for generating synthetic documents using LLMs",
    version="1.0.0",
    docs_url="/docs",
    lifespan=lifespan
)

# Add CORS middleware; allowed origins come from settings (configure in .env).
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.get_cors_origins(),  # Configure in .env
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@app.get("/", response_model=HealthResponse)
async def root():
    """Liveness probe at the service root; mirrors the /health endpoint."""
    response = HealthResponse(version="1.0.0", status="healthy")
    return response
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report service health and API version for monitoring probes."""
    payload = HealthResponse(version="1.0.0", status="healthy")
    return payload
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@app.post("/generate", response_model=GenerateDocumentResponse)
|
| 141 |
+
async def generate_documents(request: GenerateDocumentRequest):
|
| 142 |
+
"""
|
| 143 |
+
Generate synthetic documents from seed images.
|
| 144 |
+
|
| 145 |
+
Pipeline:
|
| 146 |
+
1. Download seed images from URLs
|
| 147 |
+
2. Convert images to base64
|
| 148 |
+
3. Build prompt with user parameters
|
| 149 |
+
4. Call Claude API
|
| 150 |
+
5. Extract HTML documents from response
|
| 151 |
+
6. Extract ground truth and CSS
|
| 152 |
+
7. Render HTML to PDF
|
| 153 |
+
8. Extract bounding boxes
|
| 154 |
+
9. Return results
|
| 155 |
+
"""
|
| 156 |
+
try:
|
| 157 |
+
# Step 1 & 2: Download and convert seed images to base64
|
| 158 |
+
print(f"Downloading {len(request.seed_images)} seed images...")
|
| 159 |
+
seed_images_base64 = []
|
| 160 |
+
|
| 161 |
+
# Parse request_id and handle assets
|
| 162 |
+
user_id_from_input, request_id = parse_request_id(request.request_id)
|
| 163 |
+
user_id = user_id_from_input
|
| 164 |
+
assets_temp_dir = None
|
| 165 |
+
|
| 166 |
+
# Download assets if possible
|
| 167 |
+
try:
|
| 168 |
+
from .supabase_client import supabase_client
|
| 169 |
+
# Try to get user_id from database if not in request_id
|
| 170 |
+
effective_user_id = user_id
|
| 171 |
+
if not effective_user_id:
|
| 172 |
+
effective_user_id = supabase_client.get_user_id_from_request(request_id)
|
| 173 |
+
|
| 174 |
+
if effective_user_id and request_id:
|
| 175 |
+
assets_path = f"{effective_user_id}/{request_id}/assets"
|
| 176 |
+
files = supabase_client.list_files("doc_storage", assets_path)
|
| 177 |
+
asset_files = [f for f in files if f.get('id') is not None]
|
| 178 |
+
|
| 179 |
+
if asset_files:
|
| 180 |
+
assets_temp_dir = pathlib.Path(tempfile.mkdtemp())
|
| 181 |
+
print(f"Found {len(asset_files)} assets in storage, downloading...")
|
| 182 |
+
for file_info in asset_files:
|
| 183 |
+
file_name = file_info['name']
|
| 184 |
+
try:
|
| 185 |
+
file_content = supabase_client.download_file("doc_storage", f"{assets_path}/{file_name}")
|
| 186 |
+
with open(assets_temp_dir / file_name, 'wb') as f:
|
| 187 |
+
f.write(file_content)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
print(f" ⚠ Failed to download asset {file_name}: {e}")
|
| 190 |
+
except Exception as e:
|
| 191 |
+
print(f" ⚠ Asset check failed: {e}")
|
| 192 |
+
|
| 193 |
+
for url in request.seed_images:
|
| 194 |
+
try:
|
| 195 |
+
img_b64 = await download_image_to_base64(str(url))
|
| 196 |
+
seed_images_base64.append(img_b64)
|
| 197 |
+
except Exception as e:
|
| 198 |
+
raise HTTPException(
|
| 199 |
+
status_code=status.HTTP_400_BAD_REQUEST,
|
| 200 |
+
detail=f"Failed to download image from {url}: {str(e)}"
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
print(f"Successfully downloaded {len(seed_images_base64)} images")
|
| 204 |
+
|
| 205 |
+
# Step 3: Build prompt
|
| 206 |
+
prompt_template_path = ENV.PROMPT_TEMPLATES_DIR / "ClaudeRefined12" / "seed-based-json.txt"
|
| 207 |
+
|
| 208 |
+
if not prompt_template_path.exists():
|
| 209 |
+
raise HTTPException(
|
| 210 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 211 |
+
detail=f"Prompt template not found at {prompt_template_path}"
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
prompt = build_prompt(
|
| 215 |
+
language=request.prompt_params.language,
|
| 216 |
+
doc_type=request.prompt_params.doc_type,
|
| 217 |
+
gt_type=request.prompt_params.gt_type,
|
| 218 |
+
gt_format=request.prompt_params.gt_format,
|
| 219 |
+
num_solutions=request.prompt_params.num_solutions,
|
| 220 |
+
num_seed_images=len(seed_images_base64),
|
| 221 |
+
prompt_template_path=prompt_template_path,
|
| 222 |
+
enable_visual_elements=request.prompt_params.enable_visual_elements,
|
| 223 |
+
visual_element_types=request.prompt_params.visual_element_types
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
print("Prompt built successfully")
|
| 227 |
+
|
| 228 |
+
# Step 4: Call Claude API (using settings)
|
| 229 |
+
if not settings.ANTHROPIC_API_KEY:
|
| 230 |
+
raise HTTPException(
|
| 231 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 232 |
+
detail="ANTHROPIC_API_KEY environment variable not set"
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
+
print(f"Calling Claude API with model {settings.CLAUDE_MODEL}...")
|
| 236 |
+
llm_response = await call_claude_api_direct(
|
| 237 |
+
prompt=prompt,
|
| 238 |
+
seed_images_base64=seed_images_base64,
|
| 239 |
+
api_key=settings.ANTHROPIC_API_KEY,
|
| 240 |
+
model=settings.CLAUDE_MODEL
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
print(f"Received LLM response ({len(llm_response)} chars)")
|
| 244 |
+
|
| 245 |
+
# Step 5: Extract HTML documents
|
| 246 |
+
html_documents = extract_html_documents_from_response(llm_response)
|
| 247 |
+
|
| 248 |
+
if not html_documents:
|
| 249 |
+
raise HTTPException(
|
| 250 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 251 |
+
detail="No valid HTML documents found in LLM response"
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
print(f"Extracted {len(html_documents)} HTML documents")
|
| 255 |
+
|
| 256 |
+
# Process each document
|
| 257 |
+
results = []
|
| 258 |
+
|
| 259 |
+
# Create temporary directory for PDFs
|
| 260 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 261 |
+
tmp_path = pathlib.Path(tmp_dir)
|
| 262 |
+
|
| 263 |
+
for idx, html in enumerate(html_documents):
|
| 264 |
+
try:
|
| 265 |
+
doc_id = f"{uuid.uuid4()}_{idx}"
|
| 266 |
+
print(f"Processing document {idx + 1}/{len(html_documents)} (ID: {doc_id})")
|
| 267 |
+
|
| 268 |
+
# Initialize original_pdf_path (will be set after rendering)
|
| 269 |
+
original_pdf_path = None
|
| 270 |
+
|
| 271 |
+
# Validate HTML structure (pipeline_03 validation)
|
| 272 |
+
is_valid, error_msg = validate_html_structure(html)
|
| 273 |
+
if not is_valid:
|
| 274 |
+
print(f" ⚠ HTML validation failed: {error_msg}")
|
| 275 |
+
continue
|
| 276 |
+
|
| 277 |
+
# Step 6: Extract ground truth and CSS (pipeline_03)
|
| 278 |
+
gt, html_clean = extract_ground_truth(html)
|
| 279 |
+
css, _ = extract_css_from_html(html_clean)
|
| 280 |
+
|
| 281 |
+
# DEBUG: Check if LLM generated handwriting classes
|
| 282 |
+
print(f"\n 🔍 DEBUG - Handwriting Detection:")
|
| 283 |
+
print(f" - Contains 'handwritten' class: {'handwritten' in html_clean}")
|
| 284 |
+
|
| 285 |
+
# Check for author classes (format: author1, author2, etc. - NO DASH)
|
| 286 |
+
import re
|
| 287 |
+
author_pattern = re.compile(r'\bauthor\d+\b')
|
| 288 |
+
author_matches = author_pattern.findall(html_clean)
|
| 289 |
+
|
| 290 |
+
if 'handwritten' in html_clean:
|
| 291 |
+
# Count occurrences
|
| 292 |
+
hw_count = html_clean.count('handwritten')
|
| 293 |
+
print(f" - 'handwritten' occurrences: {hw_count}")
|
| 294 |
+
print(f" - Author classes found: {len(author_matches)}")
|
| 295 |
+
if author_matches:
|
| 296 |
+
unique_authors = set(author_matches)
|
| 297 |
+
print(f" - Unique author IDs: {sorted(unique_authors)}")
|
| 298 |
+
else:
|
| 299 |
+
print(f" - ⚠️ NO author classes found (expected format: author1, author2, etc.)")
|
| 300 |
+
|
| 301 |
+
# Show first match context
|
| 302 |
+
idx = html_clean.find('handwritten')
|
| 303 |
+
context_start = max(0, idx - 50)
|
| 304 |
+
context_end = min(len(html_clean), idx + 150)
|
| 305 |
+
print(f" - First match context: ...{html_clean[context_start:context_end]}...")
|
| 306 |
+
else:
|
| 307 |
+
print(f" - ⚠️ NO handwriting classes found in LLM output!")
|
| 308 |
+
# Show sample of HTML to see structure
|
| 309 |
+
print(f" - HTML sample (first 500 chars): {html_clean[:500]}")
|
| 310 |
+
|
| 311 |
+
print(f" 🔍 DEBUG - Visual Elements Detection:")
|
| 312 |
+
print(f" - Contains 'data-placeholder': {'data-placeholder' in html_clean}")
|
| 313 |
+
if 'data-placeholder' in html_clean:
|
| 314 |
+
ve_count = html_clean.count('data-placeholder')
|
| 315 |
+
print(f" - 'data-placeholder' occurrences: {ve_count}")
|
| 316 |
+
print()
|
| 317 |
+
|
| 318 |
+
# Step 7: Render to PDF (pipeline_04) and extract geometries
|
| 319 |
+
pdf_path = tmp_path / f"{doc_id}.pdf"
|
| 320 |
+
pdf_path, width_mm, height_mm, geometries = await render_html_to_pdf(
|
| 321 |
+
html=html_clean,
|
| 322 |
+
output_pdf_path=pdf_path
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
print(f" ✓ Rendered PDF: {width_mm:.1f}mm x {height_mm:.1f}mm")
|
| 326 |
+
|
| 327 |
+
# Validate PDF (pipeline_06 style validation)
|
| 328 |
+
is_valid, error_msg = validate_pdf(pdf_path)
|
| 329 |
+
if not is_valid:
|
| 330 |
+
print(f" ⚠ PDF validation failed: {error_msg}")
|
| 331 |
+
continue
|
| 332 |
+
|
| 333 |
+
# Step 8: Extract bounding boxes (pipeline_05)
|
| 334 |
+
bboxes_raw = extract_bboxes_from_rendered_pdf(pdf_path)
|
| 335 |
+
|
| 336 |
+
# Validate bboxes (pipeline_06 style validation)
|
| 337 |
+
is_valid, error_msg = validate_bboxes(bboxes_raw, min_bbox_count=1)
|
| 338 |
+
if not is_valid:
|
| 339 |
+
print(f" ⚠ BBox validation failed: {error_msg}")
|
| 340 |
+
# Continue anyway with empty bboxes for API response
|
| 341 |
+
|
| 342 |
+
bboxes = [BoundingBox(**bbox) for bbox in bboxes_raw]
|
| 343 |
+
|
| 344 |
+
print(f" ✓ Extracted {len(bboxes)} bounding boxes")
|
| 345 |
+
|
| 346 |
+
# Step 9: Convert PDF to base64
|
| 347 |
+
pdf_b64 = pdf_to_base64(pdf_path)
|
| 348 |
+
|
| 349 |
+
# Step 10: Process Stage 3 (Handwriting & Visual Elements) if enabled
|
| 350 |
+
final_image_b64 = None
|
| 351 |
+
handwriting_regions = []
|
| 352 |
+
visual_elements = []
|
| 353 |
+
handwriting_images = {}
|
| 354 |
+
visual_element_images = {}
|
| 355 |
+
ocr_results = None
|
| 356 |
+
modified_pdf_path = None
|
| 357 |
+
|
| 358 |
+
# Track original PDF path before modification
|
| 359 |
+
original_pdf_path = pdf_path
|
| 360 |
+
|
| 361 |
+
if request.prompt_params.enable_handwriting or request.prompt_params.enable_visual_elements:
|
| 362 |
+
print(f" 🎨 Processing Stages 07-13 (Handwriting & Visual Elements)...")
|
| 363 |
+
|
| 364 |
+
try:
|
| 365 |
+
final_image_b64, handwriting_regions, visual_elements, handwriting_images, visual_element_images, pdf_with_handwriting_path, pdf_final_path = await process_stage3_complete(
|
| 366 |
+
pdf_path=pdf_path,
|
| 367 |
+
geometries=geometries,
|
| 368 |
+
ground_truth=gt,
|
| 369 |
+
bboxes_raw=bboxes_raw,
|
| 370 |
+
page_width_mm=width_mm,
|
| 371 |
+
page_height_mm=height_mm,
|
| 372 |
+
enable_handwriting=request.prompt_params.enable_handwriting,
|
| 373 |
+
handwriting_ratio=request.prompt_params.handwriting_ratio,
|
| 374 |
+
enable_visual_elements=request.prompt_params.enable_visual_elements,
|
| 375 |
+
visual_element_types=request.prompt_params.visual_element_types,
|
| 376 |
+
seed=request.prompt_params.seed,
|
| 377 |
+
assets_dir=assets_temp_dir,
|
| 378 |
+
barcode_number=request.prompt_params.barcode_number
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
# Use final PDF if modifications were made
|
| 382 |
+
if pdf_final_path and pdf_final_path.exists():
|
| 383 |
+
pdf_path = pdf_final_path
|
| 384 |
+
pdf_b64 = pdf_to_base64(pdf_path)
|
| 385 |
+
elif pdf_with_handwriting_path and pdf_with_handwriting_path.exists():
|
| 386 |
+
pdf_path = pdf_with_handwriting_path
|
| 387 |
+
pdf_b64 = pdf_to_base64(pdf_path)
|
| 388 |
+
|
| 389 |
+
print(f" ✓ Stages 07-13 complete: {len(handwriting_regions)} handwriting regions, {len(visual_elements)} visual elements")
|
| 390 |
+
print(f" - Individual tokens: {len(handwriting_images)} handwriting, {len(visual_element_images)} visual elements")
|
| 391 |
+
|
| 392 |
+
except Exception as e:
|
| 393 |
+
print(f" ⚠ Stages 07-13 processing failed: {str(e)}")
|
| 394 |
+
# Continue with original PDF if Stage 3 fails
|
| 395 |
+
|
| 396 |
+
# Step 11: Process Stages 14-15 (Image Finalization & OCR) if needed
|
| 397 |
+
if request.prompt_params.enable_ocr or (final_image_b64 is None and (request.prompt_params.enable_handwriting or request.prompt_params.enable_visual_elements)):
|
| 398 |
+
print(f" 📄 Processing Stages 14-15 (Image Finalization & OCR)...")
|
| 399 |
+
|
| 400 |
+
try:
|
| 401 |
+
stage4_image, ocr_results = await process_stage4_ocr(
|
| 402 |
+
pdf_path=pdf_path,
|
| 403 |
+
enable_ocr=request.prompt_params.enable_ocr,
|
| 404 |
+
dpi=settings.OCR_DPI
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
# Use Stage 4 image if Stage 3 didn't generate one
|
| 408 |
+
if final_image_b64 is None and stage4_image:
|
| 409 |
+
final_image_b64 = stage4_image
|
| 410 |
+
|
| 411 |
+
if ocr_results:
|
| 412 |
+
print(f" ✓ Stages 14-15 complete: Image rendered, OCR: {len(ocr_results.get('words', []))} words")
|
| 413 |
+
else:
|
| 414 |
+
print(f" ✓ Stage 14 complete: Image rendered")
|
| 415 |
+
|
| 416 |
+
except Exception as e:
|
| 417 |
+
print(f" ⚠ Stages 14-15 processing failed: {str(e)}")
|
| 418 |
+
# Continue without Stage 4
|
| 419 |
+
|
| 420 |
+
# Step 12: Process Stages 16-18 (Dataset Packaging) if needed
|
| 421 |
+
stage5_results = {}
|
| 422 |
+
if any([
|
| 423 |
+
request.prompt_params.enable_bbox_normalization,
|
| 424 |
+
request.prompt_params.enable_gt_verification,
|
| 425 |
+
request.prompt_params.enable_analysis,
|
| 426 |
+
request.prompt_params.enable_debug_visualization
|
| 427 |
+
]):
|
| 428 |
+
print(f" 📦 Processing Stages 16-18 (Dataset Packaging)...")
|
| 429 |
+
|
| 430 |
+
try:
|
| 431 |
+
stage5_results = await process_stage5_complete(
|
| 432 |
+
document_id=doc_id,
|
| 433 |
+
pdf_path=pdf_path,
|
| 434 |
+
image_base64=final_image_b64,
|
| 435 |
+
ocr_results=ocr_results,
|
| 436 |
+
ground_truth=gt,
|
| 437 |
+
has_handwriting=request.prompt_params.enable_handwriting,
|
| 438 |
+
has_visual_elements=request.prompt_params.enable_visual_elements,
|
| 439 |
+
layout_elements=visual_elements, # Use visual elements as layout proxy
|
| 440 |
+
enable_bbox_normalization=request.prompt_params.enable_bbox_normalization,
|
| 441 |
+
enable_gt_verification=request.prompt_params.enable_gt_verification,
|
| 442 |
+
enable_analysis=request.prompt_params.enable_analysis,
|
| 443 |
+
enable_debug_visualization=request.prompt_params.enable_debug_visualization
|
| 444 |
+
)
|
| 445 |
+
print(f" ✓ Stages 16-18 complete")
|
| 446 |
+
except Exception as e:
|
| 447 |
+
print(f" ⚠ Stages 16-18 processing failed: {str(e)}")
|
| 448 |
+
# Continue without Stage 5
|
| 449 |
+
|
| 450 |
+
# Step 13: Export to dataset format if requested
|
| 451 |
+
dataset_export_info = None
|
| 452 |
+
if request.prompt_params.enable_dataset_export:
|
| 453 |
+
print(f" 📦 Exporting dataset format ({request.prompt_params.dataset_export_format})...")
|
| 454 |
+
|
| 455 |
+
try:
|
| 456 |
+
from .utils import export_to_msgpack
|
| 457 |
+
|
| 458 |
+
# Only msgpack format is currently supported
|
| 459 |
+
if request.prompt_params.dataset_export_format.lower() == "msgpack":
|
| 460 |
+
# Prepare data for export
|
| 461 |
+
export_words = []
|
| 462 |
+
export_word_bboxes = []
|
| 463 |
+
export_segment_bboxes = []
|
| 464 |
+
|
| 465 |
+
# Get normalized bboxes if available (Stage 5), otherwise use raw OCR
|
| 466 |
+
if stage5_results.get('normalized_bboxes_word'):
|
| 467 |
+
# Use Stage 5 normalized bboxes
|
| 468 |
+
for bbox_entry in stage5_results['normalized_bboxes_word']:
|
| 469 |
+
export_words.append(bbox_entry.get('text', ''))
|
| 470 |
+
bbox = bbox_entry.get('bbox', [0, 0, 1, 1])
|
| 471 |
+
export_word_bboxes.append(bbox)
|
| 472 |
+
|
| 473 |
+
if stage5_results.get('normalized_bboxes_segment'):
|
| 474 |
+
for bbox_entry in stage5_results['normalized_bboxes_segment']:
|
| 475 |
+
bbox = bbox_entry.get('bbox', [0, 0, 1, 1])
|
| 476 |
+
export_segment_bboxes.append(bbox)
|
| 477 |
+
elif ocr_results:
|
| 478 |
+
# Fallback: normalize OCR bboxes manually
|
| 479 |
+
from pdf2image import convert_from_path
|
| 480 |
+
images = convert_from_path(pdf_path, dpi=settings.OCR_DPI)
|
| 481 |
+
img_width, img_height = images[0].size if images else (1000, 1000)
|
| 482 |
+
|
| 483 |
+
for word in ocr_results.get('words', []):
|
| 484 |
+
export_words.append(word.get('text', ''))
|
| 485 |
+
bbox = word.get('bbox', {'x0': 0, 'y0': 0, 'x1': 1, 'y1': 1})
|
| 486 |
+
# Normalize to [0,1]
|
| 487 |
+
norm_bbox = [
|
| 488 |
+
bbox['x0'] / img_width,
|
| 489 |
+
bbox['y0'] / img_height,
|
| 490 |
+
bbox['x1'] / img_width,
|
| 491 |
+
bbox['y1'] / img_height
|
| 492 |
+
]
|
| 493 |
+
export_word_bboxes.append(norm_bbox)
|
| 494 |
+
export_segment_bboxes.append(norm_bbox) # Use words as segments
|
| 495 |
+
else:
|
| 496 |
+
print(f" ⚠ No OCR data available for msgpack export")
|
| 497 |
+
|
| 498 |
+
if export_words and export_word_bboxes:
|
| 499 |
+
# Create msgpack file in temp directory
|
| 500 |
+
msgpack_path = pathlib.Path(tempfile.gettempdir()) / f"{doc_id}_dataset.msgpack"
|
| 501 |
+
|
| 502 |
+
await export_to_msgpack(
|
| 503 |
+
document_id=doc_id,
|
| 504 |
+
image_path=None,
|
| 505 |
+
image_base64=final_image_b64,
|
| 506 |
+
words=export_words,
|
| 507 |
+
word_bboxes=export_word_bboxes,
|
| 508 |
+
segment_bboxes=export_segment_bboxes if export_segment_bboxes else export_word_bboxes,
|
| 509 |
+
ground_truth=gt,
|
| 510 |
+
output_path=msgpack_path,
|
| 511 |
+
image_width=None,
|
| 512 |
+
image_height=None
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
# Read msgpack file as base64 for response
|
| 516 |
+
if msgpack_path.exists():
|
| 517 |
+
with open(msgpack_path, 'rb') as f:
|
| 518 |
+
msgpack_bytes = f.read()
|
| 519 |
+
msgpack_b64 = base64.b64encode(msgpack_bytes).decode('utf-8')
|
| 520 |
+
|
| 521 |
+
dataset_export_info = DatasetExportInfo(
|
| 522 |
+
format="msgpack",
|
| 523 |
+
num_samples=1,
|
| 524 |
+
output_path=str(msgpack_path),
|
| 525 |
+
msgpack_base64=msgpack_b64 if len(msgpack_bytes) < 10_000_000 else None, # Only include if < 10MB
|
| 526 |
+
metadata={
|
| 527 |
+
"document_id": doc_id,
|
| 528 |
+
"num_words": len(export_words),
|
| 529 |
+
"has_ground_truth": gt is not None,
|
| 530 |
+
"has_ocr": ocr_results is not None
|
| 531 |
+
}
|
| 532 |
+
)
|
| 533 |
+
print(f" ✓ Dataset exported to msgpack: {msgpack_path}")
|
| 534 |
+
else:
|
| 535 |
+
print(f" ⚠ Export format '{request.prompt_params.dataset_export_format}' not supported. Only 'msgpack' is available.")
|
| 536 |
+
|
| 537 |
+
except Exception as e:
|
| 538 |
+
print(f" ⚠ Dataset export failed: {str(e)}")
|
| 539 |
+
import traceback
|
| 540 |
+
traceback.print_exc()
|
| 541 |
+
|
| 542 |
+
# Prepare individual tokens based on output_detail level
|
| 543 |
+
handwriting_token_images_response = None
|
| 544 |
+
visual_element_images_response = None
|
| 545 |
+
token_mapping_response = None
|
| 546 |
+
|
| 547 |
+
output_detail = request.prompt_params.output_detail
|
| 548 |
+
|
| 549 |
+
if output_detail in ["dataset", "complete"]:
|
| 550 |
+
# Include individual token images for dataset/complete levels
|
| 551 |
+
from .utils import create_token_mapping_json
|
| 552 |
+
|
| 553 |
+
if handwriting_images or visual_element_images:
|
| 554 |
+
handwriting_token_images_response = handwriting_images
|
| 555 |
+
visual_element_images_response = visual_element_images
|
| 556 |
+
token_mapping_response = create_token_mapping_json(
|
| 557 |
+
handwriting_regions,
|
| 558 |
+
handwriting_images,
|
| 559 |
+
visual_elements,
|
| 560 |
+
visual_element_images
|
| 561 |
+
)
|
| 562 |
+
print(f" 📦 Output detail '{output_detail}': Including {len(handwriting_images)} handwriting tokens, {len(visual_element_images)} visual elements")
|
| 563 |
+
|
| 564 |
+
# Create result
|
| 565 |
+
result = DocumentResult(
|
| 566 |
+
document_id=doc_id,
|
| 567 |
+
html=html_clean,
|
| 568 |
+
css=css,
|
| 569 |
+
ground_truth=gt,
|
| 570 |
+
pdf_base64=pdf_b64,
|
| 571 |
+
bboxes=bboxes,
|
| 572 |
+
page_width_mm=width_mm,
|
| 573 |
+
page_height_mm=height_mm,
|
| 574 |
+
image_base64=final_image_b64,
|
| 575 |
+
handwriting_regions=handwriting_regions,
|
| 576 |
+
visual_elements=visual_elements,
|
| 577 |
+
handwriting_token_images=handwriting_token_images_response,
|
| 578 |
+
visual_element_images=visual_element_images_response,
|
| 579 |
+
token_mapping=token_mapping_response,
|
| 580 |
+
ocr_results=ocr_results,
|
| 581 |
+
# Stage 5 results
|
| 582 |
+
normalized_bboxes_word=stage5_results.get('normalized_bboxes_word'),
|
| 583 |
+
normalized_bboxes_segment=stage5_results.get('normalized_bboxes_segment'),
|
| 584 |
+
gt_verification=stage5_results.get('gt_verification'),
|
| 585 |
+
analysis_stats=stage5_results.get('analysis_stats'),
|
| 586 |
+
debug_visualization=stage5_results.get('debug_visualization'),
|
| 587 |
+
dataset_export=dataset_export_info
|
| 588 |
+
)
|
| 589 |
+
|
| 590 |
+
results.append(result)
|
| 591 |
+
|
| 592 |
+
except Exception as e:
|
| 593 |
+
print(f"Error processing document {idx}: {str(e)}")
|
| 594 |
+
# Continue with other documents
|
| 595 |
+
continue
|
| 596 |
+
|
| 597 |
+
if not results:
|
| 598 |
+
raise HTTPException(
|
| 599 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 600 |
+
detail="Failed to process any documents"
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
+
print(f"Successfully generated {len(results)} documents")
|
| 604 |
+
|
| 605 |
+
# Add warning message for large responses
|
| 606 |
+
output_detail = request.prompt_params.output_detail
|
| 607 |
+
message = f"Successfully generated {len(results)} documents"
|
| 608 |
+
|
| 609 |
+
if output_detail == "complete":
|
| 610 |
+
message += " ⚠️ WARNING: 'complete' output detail level may result in 50+ MB response"
|
| 611 |
+
elif output_detail == "dataset":
|
| 612 |
+
message += " (dataset mode: includes individual tokens)"
|
| 613 |
+
|
| 614 |
+
return GenerateDocumentResponse(
|
| 615 |
+
success=True,
|
| 616 |
+
message=message,
|
| 617 |
+
documents=results,
|
| 618 |
+
total_documents=len(results)
|
| 619 |
+
)
|
| 620 |
+
|
| 621 |
+
except HTTPException:
|
| 622 |
+
raise
|
| 623 |
+
except Exception as e:
|
| 624 |
+
raise HTTPException(
|
| 625 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 626 |
+
detail=f"Internal server error: {str(e)}"
|
| 627 |
+
)
|
| 628 |
+
finally:
|
| 629 |
+
# Clean up assets directory if it exists
|
| 630 |
+
if 'assets_temp_dir' in locals() and assets_temp_dir and assets_temp_dir.exists():
|
| 631 |
+
try:
|
| 632 |
+
shutil.rmtree(assets_temp_dir, ignore_errors=True)
|
| 633 |
+
print(f"✓ Cleaned up assets directory {assets_temp_dir}")
|
| 634 |
+
except:
|
| 635 |
+
pass
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
def parse_request_id(input_str: str) -> tuple:
    """Extract user_id and request_id from an input string.

    Accepted formats:
        "user_id/request_id" -> ("user_id", "request_id")
        "request_id"         -> (None, "request_id")

    Only the first "/" is significant; any additional slashes remain part
    of the returned request_id (same behavior as ``split("/", 1)``).

    Args:
        input_str: Raw identifier, optionally prefixed with a user id.

    Returns:
        Tuple of (user_id or None, request_id).
    """
    # str.partition splits on the first separator only, mirroring the
    # original split("/", 1) semantics without index juggling.
    user_id, sep, request_id = input_str.partition("/")
    if sep:
        return user_id, request_id
    return None, input_str
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
@app.post("/generate/pdf")
|
| 647 |
+
async def generate_document_pdf(
|
| 648 |
+
request: GenerateDocumentRequest,
|
| 649 |
+
background_tasks: BackgroundTasks
|
| 650 |
+
):
|
| 651 |
+
"""
|
| 652 |
+
Generate documents and return them as downloadable PDF files (FAST DEMO ENDPOINT).
|
| 653 |
+
|
| 654 |
+
This endpoint generates documents and returns a ZIP file immediately (20-60 seconds).
|
| 655 |
+
|
| 656 |
+
**Workflow:**
|
| 657 |
+
1. Frontend creates document_requests entry in Supabase with status="pending"
|
| 658 |
+
2. Frontend sends request_id to this endpoint along with tokens and seed images
|
| 659 |
+
3. API fetches existing request, validates, and starts generation
|
| 660 |
+
4. API updates status through: processing → generating → completed/failed
|
| 661 |
+
5. ZIP file is returned immediately
|
| 662 |
+
6. If google_drive_token provided: ZIP is uploaded to GDrive in background
|
| 663 |
+
|
| 664 |
+
**Request Parameters:**
|
| 665 |
+
- request_id: UUID of existing document_requests entry (required)
|
| 666 |
+
- seed_images: List of image URLs to use as document backgrounds (required)
|
| 667 |
+
- google_drive_token: OAuth token for GDrive upload (optional, enables backup)
|
| 668 |
+
- google_drive_refresh_token: Refresh token for GDrive (optional)
|
| 669 |
+
- prompt_params: Document generation parameters
|
| 670 |
+
|
| 671 |
+
**Use Cases:**
|
| 672 |
+
- Quick demos and testing (with direct Claude API)
|
| 673 |
+
- Production with progress tracking and GDrive backup
|
| 674 |
+
|
| 675 |
+
**For batch processing:** Use `/generate/async` (50% cheaper, 5-30 minutes)
|
| 676 |
+
"""
|
| 677 |
+
# Get request_id from database
|
| 678 |
+
user_id_from_input, request_id = parse_request_id(request.request_id)
|
| 679 |
+
user_id = user_id_from_input
|
| 680 |
+
supabase_enabled = False
|
| 681 |
+
gdrive_enabled = False
|
| 682 |
+
|
| 683 |
+
try:
|
| 684 |
+
# Import supabase_client
|
| 685 |
+
from .supabase_client import supabase_client
|
| 686 |
+
|
| 687 |
+
# Get existing request from database
|
| 688 |
+
existing_request = supabase_client.get_request(request_id)
|
| 689 |
+
if not existing_request:
|
| 690 |
+
raise HTTPException(
|
| 691 |
+
status_code=status.HTTP_404_NOT_FOUND,
|
| 692 |
+
detail=f"Request {request_id} not found in database"
|
| 693 |
+
)
|
| 694 |
+
|
| 695 |
+
# Use user_id from input if available, otherwise from database
|
| 696 |
+
if not user_id:
|
| 697 |
+
user_id = existing_request["user_id"]
|
| 698 |
+
|
| 699 |
+
supabase_enabled = True
|
| 700 |
+
|
| 701 |
+
print(f"[Request {request_id}] Processing request for user {user_id}")
|
| 702 |
+
print(f"[Request {request_id}] Current status: {existing_request['status']}")
|
| 703 |
+
|
| 704 |
+
# Validate Google Drive token if provided
|
| 705 |
+
if request.google_drive_token:
|
| 706 |
+
gdrive_enabled = True
|
| 707 |
+
|
| 708 |
+
# Download assets from Supabase storage if they exist
|
| 709 |
+
assets_temp_dir = None
|
| 710 |
+
if supabase_enabled:
|
| 711 |
+
try:
|
| 712 |
+
assets_path = f"{user_id}/{request_id}/assets"
|
| 713 |
+
files = supabase_client.list_files("doc_storage", assets_path)
|
| 714 |
+
|
| 715 |
+
# Filter out directories
|
| 716 |
+
asset_files = [f for f in files if f.get('id') is not None]
|
| 717 |
+
|
| 718 |
+
if asset_files:
|
| 719 |
+
assets_temp_dir = pathlib.Path(tempfile.mkdtemp())
|
| 720 |
+
print(f"[Request {request_id}] Found {len(asset_files)} assets in storage, downloading...")
|
| 721 |
+
|
| 722 |
+
for file_info in asset_files:
|
| 723 |
+
file_name = file_info['name']
|
| 724 |
+
try:
|
| 725 |
+
file_content = supabase_client.download_file("doc_storage", f"{assets_path}/{file_name}")
|
| 726 |
+
with open(assets_temp_dir / file_name, 'wb') as f:
|
| 727 |
+
f.write(file_content)
|
| 728 |
+
print(f" ✓ Downloaded {file_name}")
|
| 729 |
+
except Exception as download_err:
|
| 730 |
+
print(f" ⚠ Failed to download {file_name}: {download_err}")
|
| 731 |
+
else:
|
| 732 |
+
print(f"[Request {request_id}] No assets found in {assets_path}")
|
| 733 |
+
except Exception as e:
|
| 734 |
+
print(f"[Request {request_id}] ⚠ Asset check/download failed: {e}")
|
| 735 |
+
print(f"[Request {request_id}] GDrive integration enabled")
|
| 736 |
+
|
| 737 |
+
# Log analytics
|
| 738 |
+
try:
|
| 739 |
+
supabase_client.log_analytics_event(
|
| 740 |
+
user_id=user_id,
|
| 741 |
+
event_type="document_generation_started_sync",
|
| 742 |
+
entity_id=request_id
|
| 743 |
+
)
|
| 744 |
+
except Exception as e:
|
| 745 |
+
print(f"[Request {request_id}] Warning: Analytics logging failed: {e}")
|
| 746 |
+
|
| 747 |
+
except HTTPException:
|
| 748 |
+
raise
|
| 749 |
+
except Exception as e:
|
| 750 |
+
print(f"Error: Failed to fetch request from database: {e}")
|
| 751 |
+
raise HTTPException(
|
| 752 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 753 |
+
detail=f"Failed to fetch request: {str(e)}"
|
| 754 |
+
)
|
| 755 |
+
|
| 756 |
+
# Update status: Downloading seed images
|
| 757 |
+
if supabase_enabled:
|
| 758 |
+
try:
|
| 759 |
+
supabase_client.update_request_status(request_id, "downloading")
|
| 760 |
+
print(f"[Request {request_id}] Status: downloading (fetching seed images)")
|
| 761 |
+
except Exception as e:
|
| 762 |
+
print(f"Warning: Status update failed: {e}")
|
| 763 |
+
|
| 764 |
+
try:
|
| 765 |
+
# Step 1 & 2: Download and convert seed images to base64
|
| 766 |
+
print(f"Downloading {len(request.seed_images)} seed images...")
|
| 767 |
+
seed_images_base64 = []
|
| 768 |
+
for url in request.seed_images:
|
| 769 |
+
try:
|
| 770 |
+
img_b64 = await download_image_to_base64(str(url))
|
| 771 |
+
seed_images_base64.append(img_b64)
|
| 772 |
+
except Exception as e:
|
| 773 |
+
raise HTTPException(
|
| 774 |
+
status_code=status.HTTP_400_BAD_REQUEST,
|
| 775 |
+
detail=f"Failed to download image from {url}: {str(e)}"
|
| 776 |
+
)
|
| 777 |
+
|
| 778 |
+
print(f"Successfully downloaded {len(seed_images_base64)} images")
|
| 779 |
+
|
| 780 |
+
# Step 3: Build prompt
|
| 781 |
+
prompt_template_path = ENV.PROMPT_TEMPLATES_DIR / "ClaudeRefined12" / "seed-based-json.txt"
|
| 782 |
+
|
| 783 |
+
if not prompt_template_path.exists():
|
| 784 |
+
raise HTTPException(
|
| 785 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 786 |
+
detail=f"Prompt template not found at {prompt_template_path}"
|
| 787 |
+
)
|
| 788 |
+
|
| 789 |
+
prompt = build_prompt(
|
| 790 |
+
language=request.prompt_params.language,
|
| 791 |
+
doc_type=request.prompt_params.doc_type,
|
| 792 |
+
gt_type=request.prompt_params.gt_type,
|
| 793 |
+
gt_format=request.prompt_params.gt_format,
|
| 794 |
+
num_solutions=request.prompt_params.num_solutions,
|
| 795 |
+
num_seed_images=len(seed_images_base64),
|
| 796 |
+
prompt_template_path=prompt_template_path,
|
| 797 |
+
enable_visual_elements=request.prompt_params.enable_visual_elements,
|
| 798 |
+
visual_element_types=request.prompt_params.visual_element_types
|
| 799 |
+
)
|
| 800 |
+
|
| 801 |
+
print("Prompt built successfully")
|
| 802 |
+
|
| 803 |
+
# Update status: Generating (calling LLM)
|
| 804 |
+
if supabase_enabled:
|
| 805 |
+
try:
|
| 806 |
+
supabase_client.update_request_status(request_id, "generating")
|
| 807 |
+
print(f"[Request {request_id}] Status: generating (calling LLM)")
|
| 808 |
+
except Exception as e:
|
| 809 |
+
print(f"Warning: Status update failed: {e}")
|
| 810 |
+
|
| 811 |
+
# Step 4: Call Claude API (using settings)
|
| 812 |
+
print(f"Calling Claude API with model {settings.CLAUDE_MODEL}...")
|
| 813 |
+
llm_response = await call_claude_api_direct(
|
| 814 |
+
prompt=prompt,
|
| 815 |
+
seed_images_base64=seed_images_base64,
|
| 816 |
+
api_key=settings.ANTHROPIC_API_KEY,
|
| 817 |
+
model=settings.CLAUDE_MODEL
|
| 818 |
+
)
|
| 819 |
+
|
| 820 |
+
print(f"Received LLM response ({len(llm_response)} chars)")
|
| 821 |
+
|
| 822 |
+
# Step 5: Extract HTML documents
|
| 823 |
+
html_documents = extract_html_documents_from_response(llm_response)
|
| 824 |
+
|
| 825 |
+
if not html_documents:
|
| 826 |
+
raise HTTPException(
|
| 827 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 828 |
+
detail="No valid HTML documents found in LLM response"
|
| 829 |
+
)
|
| 830 |
+
|
| 831 |
+
print(f"Extracted {len(html_documents)} HTML documents")
|
| 832 |
+
|
| 833 |
+
# Extract output_detail early to use in ZIP packaging later
|
| 834 |
+
output_detail = request.prompt_params.output_detail
|
| 835 |
+
|
| 836 |
+
# Create temporary directory for PDFs
|
| 837 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 838 |
+
tmp_path = pathlib.Path(tmp_dir)
|
| 839 |
+
|
| 840 |
+
# Initialize DatasetExporter for organized structure
|
| 841 |
+
from .dataset_exporter import DatasetExporter
|
| 842 |
+
exporter = DatasetExporter(tmp_path, dataset_name="docgenie_documents")
|
| 843 |
+
|
| 844 |
+
pdf_files = []
|
| 845 |
+
metadata = []
|
| 846 |
+
|
| 847 |
+
for idx, html in enumerate(html_documents):
|
| 848 |
+
try:
|
| 849 |
+
doc_id = f"document_{idx + 1}"
|
| 850 |
+
print(f"Processing document {idx + 1}/{len(html_documents)} (ID: {doc_id})")
|
| 851 |
+
|
| 852 |
+
# Initialize original_pdf_path (will be set after rendering)
|
| 853 |
+
original_pdf_path = None
|
| 854 |
+
|
| 855 |
+
# Extract ground truth
|
| 856 |
+
gt, html_clean = extract_ground_truth(html)
|
| 857 |
+
|
| 858 |
+
# DEBUG: Check if LLM generated handwriting classes
|
| 859 |
+
print(f"\n 🔍 DEBUG - Handwriting Detection:")
|
| 860 |
+
print(f" - Contains 'handwritten' class: {'handwritten' in html_clean}")
|
| 861 |
+
|
| 862 |
+
# Check for author classes (format: author1, author2, etc. - NO DASH)
|
| 863 |
+
import re
|
| 864 |
+
author_pattern = re.compile(r'\bauthor\d+\b')
|
| 865 |
+
author_matches = author_pattern.findall(html_clean)
|
| 866 |
+
|
| 867 |
+
if 'handwritten' in html_clean:
|
| 868 |
+
# Count occurrences
|
| 869 |
+
hw_count = html_clean.count('handwritten')
|
| 870 |
+
print(f" - 'handwritten' occurrences: {hw_count}")
|
| 871 |
+
print(f" - Author classes found: {len(author_matches)}")
|
| 872 |
+
if author_matches:
|
| 873 |
+
unique_authors = set(author_matches)
|
| 874 |
+
print(f" - Unique author IDs: {sorted(unique_authors)}")
|
| 875 |
+
else:
|
| 876 |
+
print(f" - ⚠️ NO author classes found (expected format: author1, author2, etc.)")
|
| 877 |
+
|
| 878 |
+
# Show first match context
|
| 879 |
+
idx = html_clean.find('handwritten')
|
| 880 |
+
context_start = max(0, idx - 50)
|
| 881 |
+
context_end = min(len(html_clean), idx + 150)
|
| 882 |
+
print(f" - First match context: ...{html_clean[context_start:context_end]}...")
|
| 883 |
+
else:
|
| 884 |
+
print(f" - ⚠️ NO handwriting classes found in LLM output!")
|
| 885 |
+
# Show sample of HTML to see structure
|
| 886 |
+
print(f" - HTML sample (first 500 chars): {html_clean[:500]}")
|
| 887 |
+
|
| 888 |
+
print(f" 🔍 DEBUG - Visual Elements Detection:")
|
| 889 |
+
print(f" - Contains 'data-placeholder': {'data-placeholder' in html_clean}")
|
| 890 |
+
if 'data-placeholder' in html_clean:
|
| 891 |
+
ve_count = html_clean.count('data-placeholder')
|
| 892 |
+
print(f" - 'data-placeholder' occurrences: {ve_count}")
|
| 893 |
+
print()
|
| 894 |
+
|
| 895 |
+
# Render to PDF and extract geometries
|
| 896 |
+
pdf_path = tmp_path / f"{doc_id}.pdf"
|
| 897 |
+
pdf_path, width_mm, height_mm, geometries = await render_html_to_pdf(
|
| 898 |
+
html=html_clean,
|
| 899 |
+
output_pdf_path=pdf_path
|
| 900 |
+
)
|
| 901 |
+
|
| 902 |
+
print(f" - Rendered PDF: {width_mm:.1f}mm x {height_mm:.1f}mm")
|
| 903 |
+
|
| 904 |
+
# Extract bounding boxes
|
| 905 |
+
bboxes_raw = extract_bboxes_from_rendered_pdf(pdf_path)
|
| 906 |
+
|
| 907 |
+
print(f" - Extracted {len(bboxes_raw)} bounding boxes")
|
| 908 |
+
|
| 909 |
+
# Extract CSS for Stage 3
|
| 910 |
+
css, _ = extract_css_from_html(html_clean)
|
| 911 |
+
|
| 912 |
+
# Step: Process Stage 3 (Handwriting & Visual Elements) if enabled
|
| 913 |
+
final_image_b64 = None
|
| 914 |
+
handwriting_regions = []
|
| 915 |
+
visual_elements = []
|
| 916 |
+
handwriting_images = {}
|
| 917 |
+
visual_element_images = {}
|
| 918 |
+
ocr_results = None
|
| 919 |
+
pdf_with_handwriting_path = None
|
| 920 |
+
pdf_final_path = None
|
| 921 |
+
|
| 922 |
+
# Track original PDF path before modification
|
| 923 |
+
original_pdf_path = pdf_path
|
| 924 |
+
|
| 925 |
+
if request.prompt_params.enable_handwriting or request.prompt_params.enable_visual_elements:
|
| 926 |
+
print(f" 🎨 Processing Stages 07-13 (Handwriting & Visual Elements)...")
|
| 927 |
+
|
| 928 |
+
try:
|
| 929 |
+
final_image_b64, handwriting_regions, visual_elements, handwriting_images, visual_element_images, pdf_with_handwriting_path, pdf_final_path = await process_stage3_complete(
|
| 930 |
+
pdf_path=pdf_path,
|
| 931 |
+
geometries=geometries,
|
| 932 |
+
ground_truth=gt,
|
| 933 |
+
bboxes_raw=bboxes_raw,
|
| 934 |
+
page_width_mm=width_mm,
|
| 935 |
+
page_height_mm=height_mm,
|
| 936 |
+
enable_handwriting=request.prompt_params.enable_handwriting,
|
| 937 |
+
handwriting_ratio=request.prompt_params.handwriting_ratio,
|
| 938 |
+
enable_visual_elements=request.prompt_params.enable_visual_elements,
|
| 939 |
+
visual_element_types=request.prompt_params.visual_element_types,
|
| 940 |
+
seed=request.prompt_params.seed,
|
| 941 |
+
assets_dir=assets_temp_dir,
|
| 942 |
+
barcode_number=request.prompt_params.barcode_number
|
| 943 |
+
)
|
| 944 |
+
|
| 945 |
+
# Use final PDF if modifications were made
|
| 946 |
+
if pdf_final_path and pdf_final_path.exists():
|
| 947 |
+
pdf_path = pdf_final_path
|
| 948 |
+
elif pdf_with_handwriting_path and pdf_with_handwriting_path.exists():
|
| 949 |
+
pdf_path = pdf_with_handwriting_path
|
| 950 |
+
|
| 951 |
+
print(f" ✓ Stages 07-13 complete: {len(handwriting_regions)} handwriting regions, {len(visual_elements)} visual elements")
|
| 952 |
+
print(f" - Individual tokens: {len(handwriting_images)} handwriting, {len(visual_element_images)} visual elements")
|
| 953 |
+
|
| 954 |
+
except Exception as e:
|
| 955 |
+
print(f" ⚠ Stages 07-13 processing failed: {str(e)}")
|
| 956 |
+
# Continue with original PDF if Stage 3 fails
|
| 957 |
+
|
| 958 |
+
# Step: Process Stages 14-15 (Image Finalization & OCR) if needed
|
| 959 |
+
if request.prompt_params.enable_ocr:
|
| 960 |
+
print(f" 📄 Processing Stages 14-15 (OCR)...")
|
| 961 |
+
|
| 962 |
+
try:
|
| 963 |
+
stage4_image, ocr_results = await process_stage4_ocr(
|
| 964 |
+
pdf_path=pdf_path,
|
| 965 |
+
enable_ocr=True,
|
| 966 |
+
dpi=settings.OCR_DPI
|
| 967 |
+
)
|
| 968 |
+
|
| 969 |
+
if ocr_results:
|
| 970 |
+
print(f" ✓ Stages 14-15 complete: OCR: {len(ocr_results.get('words', []))} words")
|
| 971 |
+
|
| 972 |
+
except Exception as e:
|
| 973 |
+
print(f" ⚠ Stages 14-15 processing failed: {str(e)}")
|
| 974 |
+
# Continue without Stage 4
|
| 975 |
+
|
| 976 |
+
# Step: Process Stages 16-18 (Dataset Packaging) if needed
|
| 977 |
+
stage5_results = {}
|
| 978 |
+
if any([
|
| 979 |
+
request.prompt_params.enable_bbox_normalization,
|
| 980 |
+
request.prompt_params.enable_gt_verification,
|
| 981 |
+
request.prompt_params.enable_analysis,
|
| 982 |
+
request.prompt_params.enable_debug_visualization
|
| 983 |
+
]):
|
| 984 |
+
print(f" 📦 Processing Stages 16-18 (Dataset Packaging)...")
|
| 985 |
+
|
| 986 |
+
try:
|
| 987 |
+
stage5_results = await process_stage5_complete(
|
| 988 |
+
document_id=doc_id,
|
| 989 |
+
pdf_path=pdf_path,
|
| 990 |
+
image_base64=final_image_b64,
|
| 991 |
+
ocr_results=ocr_results,
|
| 992 |
+
ground_truth=gt,
|
| 993 |
+
has_handwriting=request.prompt_params.enable_handwriting,
|
| 994 |
+
has_visual_elements=request.prompt_params.enable_visual_elements,
|
| 995 |
+
layout_elements=visual_elements,
|
| 996 |
+
enable_bbox_normalization=request.prompt_params.enable_bbox_normalization,
|
| 997 |
+
enable_gt_verification=request.prompt_params.enable_gt_verification,
|
| 998 |
+
enable_analysis=request.prompt_params.enable_analysis,
|
| 999 |
+
enable_debug_visualization=request.prompt_params.enable_debug_visualization
|
| 1000 |
+
)
|
| 1001 |
+
print(f" ✓ Stages 16-18 complete")
|
| 1002 |
+
except Exception as e:
|
| 1003 |
+
print(f" ⚠ Stages 16-18 processing failed: {str(e)}")
|
| 1004 |
+
# Continue without Stages 16-18
|
| 1005 |
+
|
| 1006 |
+
# Track PDFs for metadata
|
| 1007 |
+
if original_pdf_path and pdf_path != original_pdf_path:
|
| 1008 |
+
pdf_files.append(original_pdf_path)
|
| 1009 |
+
pdf_files.append(pdf_path)
|
| 1010 |
+
else:
|
| 1011 |
+
pdf_files.append(pdf_path)
|
| 1012 |
+
|
| 1013 |
+
# Extract bbox_pdf (word + char) from original PDF (ground truth positions)
|
| 1014 |
+
from .utils import extract_all_bboxes_from_pdf, extract_raw_annotations_from_geometries
|
| 1015 |
+
print(f" 📦 Extracting bbox_pdf (word + char level) from original PDF...")
|
| 1016 |
+
|
| 1017 |
+
try:
|
| 1018 |
+
bboxes_pdf = extract_all_bboxes_from_pdf(original_pdf_path if original_pdf_path else pdf_path)
|
| 1019 |
+
bbox_pdf_word = bboxes_pdf.get('word', [])
|
| 1020 |
+
bbox_pdf_char = bboxes_pdf.get('char', [])
|
| 1021 |
+
print(f" ✓ Extracted {len(bbox_pdf_word)} word bboxes, {len(bbox_pdf_char)} char bboxes from PDF")
|
| 1022 |
+
except Exception as e:
|
| 1023 |
+
print(f" ⚠ bbox_pdf extraction failed: {e}")
|
| 1024 |
+
bbox_pdf_word = bboxes_raw # Fallback to raw bboxes
|
| 1025 |
+
bbox_pdf_char = []
|
| 1026 |
+
|
| 1027 |
+
# Extract raw_annotations (layout boxes before normalization)
|
| 1028 |
+
raw_annotations = None
|
| 1029 |
+
if geometries:
|
| 1030 |
+
print(f" 📦 Extracting raw_annotations from geometries...")
|
| 1031 |
+
try:
|
| 1032 |
+
raw_annotations = extract_raw_annotations_from_geometries(geometries)
|
| 1033 |
+
print(f" ✓ Extracted {len(raw_annotations)} layout annotations")
|
| 1034 |
+
except Exception as e:
|
| 1035 |
+
print(f" ⚠ raw_annotations extraction failed: {e}")
|
| 1036 |
+
|
| 1037 |
+
# Decode final image to bytes
|
| 1038 |
+
final_image_bytes = None
|
| 1039 |
+
if final_image_b64:
|
| 1040 |
+
import base64
|
| 1041 |
+
final_image_bytes = base64.b64decode(final_image_b64)
|
| 1042 |
+
|
| 1043 |
+
# Decode debug visualization
|
| 1044 |
+
debug_viz_bytes = None
|
| 1045 |
+
if stage5_results.get('debug_visualization'):
|
| 1046 |
+
debug_viz_dict = stage5_results['debug_visualization']
|
| 1047 |
+
if debug_viz_dict and 'bbox_overlay_base64' in debug_viz_dict:
|
| 1048 |
+
debug_viz_b64 = debug_viz_dict['bbox_overlay_base64']
|
| 1049 |
+
debug_viz_bytes = base64.b64decode(debug_viz_b64)
|
| 1050 |
+
|
| 1051 |
+
# Prepare token mapping if tokens exist
|
| 1052 |
+
token_mapping_data = None
|
| 1053 |
+
if output_detail in ["dataset", "complete"]:
|
| 1054 |
+
if handwriting_images or visual_element_images:
|
| 1055 |
+
from .utils import create_token_mapping_json
|
| 1056 |
+
token_mapping_data = create_token_mapping_json(
|
| 1057 |
+
handwriting_regions,
|
| 1058 |
+
handwriting_images,
|
| 1059 |
+
visual_elements,
|
| 1060 |
+
visual_element_images
|
| 1061 |
+
)
|
| 1062 |
+
print(f" 📦 Output detail '{output_detail}': Prepared {len(handwriting_images)} handwriting tokens, {len(visual_element_images)} visual elements")
|
| 1063 |
+
|
| 1064 |
+
# Extract bbox_final_word and bbox_final_segment (from OCR or PDF)
|
| 1065 |
+
bbox_final_word = None
|
| 1066 |
+
bbox_final_segment = None
|
| 1067 |
+
if ocr_results and ocr_results.get('words'):
|
| 1068 |
+
# Use OCR results as final bboxes
|
| 1069 |
+
bbox_final_word = ocr_results.get('words', [])
|
| 1070 |
+
bbox_final_segment = ocr_results.get('lines', [])
|
| 1071 |
+
else:
|
| 1072 |
+
# Fallback to PDF bboxes if no OCR
|
| 1073 |
+
bbox_final_word = bbox_pdf_word
|
| 1074 |
+
bbox_final_segment = [] # No line-level data without OCR
|
| 1075 |
+
|
| 1076 |
+
# Read PDF bytes for exporter (capture all stages)
|
| 1077 |
+
pdf_initial_bytes = original_pdf_path.read_bytes()
|
| 1078 |
+
pdf_with_handwriting_bytes = pdf_with_handwriting_path.read_bytes() if pdf_with_handwriting_path and pdf_with_handwriting_path.exists() else None
|
| 1079 |
+
pdf_final_bytes = pdf_final_path.read_bytes() if pdf_final_path and pdf_final_path.exists() else None
|
| 1080 |
+
|
| 1081 |
+
# For visual elements only (no handwriting), pdf_final_path actually points to the VE-only PDF
|
| 1082 |
+
pdf_with_visual_elements_bytes = None
|
| 1083 |
+
if pdf_final_bytes and not pdf_with_handwriting_bytes:
|
| 1084 |
+
# Only visual elements were added, not handwriting
|
| 1085 |
+
pdf_with_visual_elements_bytes = pdf_final_bytes
|
| 1086 |
+
pdf_final_bytes = None # No "final" with both modifications
|
| 1087 |
+
|
| 1088 |
+
# Add document to exporter
|
| 1089 |
+
print(f" 📦 Adding document to dataset exporter...")
|
| 1090 |
+
exporter.add_document(
|
| 1091 |
+
document_id=doc_id,
|
| 1092 |
+
html=html_clean,
|
| 1093 |
+
css=css,
|
| 1094 |
+
pdf_initial=pdf_initial_bytes,
|
| 1095 |
+
pdf_with_handwriting=pdf_with_handwriting_bytes,
|
| 1096 |
+
pdf_with_visual_elements=pdf_with_visual_elements_bytes,
|
| 1097 |
+
pdf_final=pdf_final_bytes,
|
| 1098 |
+
final_image=final_image_bytes,
|
| 1099 |
+
ground_truth=gt,
|
| 1100 |
+
raw_annotations=raw_annotations,
|
| 1101 |
+
bboxes_pdf_word=bbox_pdf_word,
|
| 1102 |
+
bboxes_pdf_char=bbox_pdf_char,
|
| 1103 |
+
bboxes_final_word=bbox_final_word,
|
| 1104 |
+
bboxes_final_segment=bbox_final_segment,
|
| 1105 |
+
bboxes_normalized_word=stage5_results.get('normalized_bboxes_word'),
|
| 1106 |
+
bboxes_normalized_segment=stage5_results.get('normalized_bboxes_segment'),
|
| 1107 |
+
gt_verification=stage5_results.get('gt_verification'),
|
| 1108 |
+
token_mapping=token_mapping_data,
|
| 1109 |
+
handwriting_regions=handwriting_regions,
|
| 1110 |
+
handwriting_images=handwriting_images,
|
| 1111 |
+
visual_elements=visual_elements,
|
| 1112 |
+
visual_element_images=visual_element_images,
|
| 1113 |
+
layout_elements=visual_elements,
|
| 1114 |
+
geometries=geometries,
|
| 1115 |
+
ocr_results=ocr_results,
|
| 1116 |
+
analysis_stats=stage5_results.get('analysis_stats'),
|
| 1117 |
+
debug_visualization=debug_viz_bytes
|
| 1118 |
+
)
|
| 1119 |
+
print(f" ✓ Document {doc_id} added to dataset")
|
| 1120 |
+
|
| 1121 |
+
# Store metadata
|
| 1122 |
+
metadata.append({
|
| 1123 |
+
"document_id": doc_id,
|
| 1124 |
+
"filename": f"{doc_id}.pdf",
|
| 1125 |
+
"bboxes": bboxes_raw,
|
| 1126 |
+
"ground_truth": gt,
|
| 1127 |
+
"geometries": geometries,
|
| 1128 |
+
"page_width_mm": width_mm,
|
| 1129 |
+
"page_height_mm": height_mm,
|
| 1130 |
+
"handwriting_regions": handwriting_regions,
|
| 1131 |
+
"visual_elements": visual_elements,
|
| 1132 |
+
"has_stage3_image": final_image_b64 is not None,
|
| 1133 |
+
"ocr_results": ocr_results,
|
| 1134 |
+
# Stage 5 results
|
| 1135 |
+
"normalized_bboxes_word": stage5_results.get('normalized_bboxes_word'),
|
| 1136 |
+
"normalized_bboxes_segment": stage5_results.get('normalized_bboxes_segment'),
|
| 1137 |
+
"gt_verification": stage5_results.get('gt_verification'),
|
| 1138 |
+
"analysis_stats": stage5_results.get('analysis_stats'),
|
| 1139 |
+
"debug_visualization_available": stage5_results.get('debug_visualization') is not None
|
| 1140 |
+
})
|
| 1141 |
+
|
| 1142 |
+
except Exception as e:
|
| 1143 |
+
print(f"Error processing document {idx}: {str(e)}")
|
| 1144 |
+
# Continue with other documents
|
| 1145 |
+
continue
|
| 1146 |
+
|
| 1147 |
+
if not pdf_files:
|
| 1148 |
+
raise HTTPException(
|
| 1149 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 1150 |
+
detail="Failed to process any documents"
|
| 1151 |
+
)
|
| 1152 |
+
|
| 1153 |
+
print(f"Successfully generated {len(pdf_files)} documents")
|
| 1154 |
+
|
| 1155 |
+
# Finalize dataset export (writes metadata.json and README.md)
|
| 1156 |
+
print(f"📦 Finalizing dataset export...")
|
| 1157 |
+
exporter.finalize(
|
| 1158 |
+
request_id=request_id if request_id else "unnamed",
|
| 1159 |
+
user_id=user_id,
|
| 1160 |
+
prompt_params=request.prompt_params.dict(),
|
| 1161 |
+
api_mode="sync"
|
| 1162 |
+
)
|
| 1163 |
+
print(f"✓ Dataset structure finalized at {exporter.base_path}")
|
| 1164 |
+
|
| 1165 |
+
# Update status: Zipping
|
| 1166 |
+
if supabase_enabled:
|
| 1167 |
+
try:
|
| 1168 |
+
supabase_client.update_request_status(request_id, "zipping")
|
| 1169 |
+
print(f"[Request {request_id}] Status: zipping (creating ZIP archive)")
|
| 1170 |
+
except Exception as e:
|
| 1171 |
+
print(f"Warning: Status update failed: {e}")
|
| 1172 |
+
|
| 1173 |
+
# Create ZIP from organized dataset
|
| 1174 |
+
print(f"📦 Creating ZIP archive from dataset...")
|
| 1175 |
+
zip_buffer = io.BytesIO()
|
| 1176 |
+
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
| 1177 |
+
# Add all files from exporter.base_path
|
| 1178 |
+
for file_path in exporter.base_path.rglob('*'):
|
| 1179 |
+
if file_path.is_file():
|
| 1180 |
+
arcname = file_path.relative_to(exporter.base_path.parent)
|
| 1181 |
+
zip_file.write(file_path, arcname)
|
| 1182 |
+
|
| 1183 |
+
zip_buffer.seek(0)
|
| 1184 |
+
zip_size_mb = len(zip_buffer.getvalue()) / (1024 * 1024)
|
| 1185 |
+
print(f"✓ ZIP created: {zip_size_mb:.2f} MB")
|
| 1186 |
+
|
| 1187 |
+
# Update status: Completed
|
| 1188 |
+
if supabase_enabled and request_id:
|
| 1189 |
+
try:
|
| 1190 |
+
from .supabase_client import supabase_client
|
| 1191 |
+
supabase_client.update_request_status(request_id, "completed")
|
| 1192 |
+
print(f"[Request {request_id}] Status: completed")
|
| 1193 |
+
except Exception as e:
|
| 1194 |
+
print(f"[Request {request_id}] ⚠ Supabase update failed: {e}")
|
| 1195 |
+
|
| 1196 |
+
# Save ZIP to temporary file for background upload
|
| 1197 |
+
temp_zip_path = pathlib.Path(tempfile.gettempdir()) / f"docgenie_{request_id}.zip"
|
| 1198 |
+
temp_zip_path.write_bytes(zip_buffer.getvalue())
|
| 1199 |
+
|
| 1200 |
+
# Schedule background task: Upload to Google Drive
|
| 1201 |
+
if gdrive_enabled and request_id and request.google_drive_token:
|
| 1202 |
+
# Update status: Uploading
|
| 1203 |
+
try:
|
| 1204 |
+
supabase_client.update_request_status(request_id, "uploading")
|
| 1205 |
+
print(f"[Request {request_id}] Status: uploading (uploading to Google Drive)")
|
| 1206 |
+
except Exception as e:
|
| 1207 |
+
print(f"Warning: Status update failed: {e}")
|
| 1208 |
+
|
| 1209 |
+
print(f"[Request {request_id}] Scheduling GDrive upload in background...")
|
| 1210 |
+
|
| 1211 |
+
background_tasks.add_task(
|
| 1212 |
+
upload_zip_to_gdrive_background,
|
| 1213 |
+
request_id=request_id,
|
| 1214 |
+
zip_path=temp_zip_path,
|
| 1215 |
+
access_token=request.google_drive_token,
|
| 1216 |
+
refresh_token=request.google_drive_refresh_token,
|
| 1217 |
+
num_documents=len(pdf_files)
|
| 1218 |
+
)
|
| 1219 |
+
|
| 1220 |
+
# Save files for Supabase background upload
|
| 1221 |
+
if supabase_enabled:
|
| 1222 |
+
import shutil
|
| 1223 |
+
supabase_temp_dir = pathlib.Path(tempfile.gettempdir()) / f"docgenie_supabase_{request_id}"
|
| 1224 |
+
if supabase_temp_dir.exists():
|
| 1225 |
+
shutil.rmtree(supabase_temp_dir, ignore_errors=True)
|
| 1226 |
+
|
| 1227 |
+
# Copy exporter base_path to persistent temp dir
|
| 1228 |
+
shutil.copytree(exporter.base_path, supabase_temp_dir)
|
| 1229 |
+
|
| 1230 |
+
print(f"[Request {request_id}] Scheduling Supabase document upload in background...")
|
| 1231 |
+
background_tasks.add_task(
|
| 1232 |
+
upload_documents_to_supabase_background,
|
| 1233 |
+
request_id=request_id,
|
| 1234 |
+
user_id=str(user_id),
|
| 1235 |
+
temp_dir=str(supabase_temp_dir),
|
| 1236 |
+
num_documents=len(exporter.documents),
|
| 1237 |
+
model_version=settings.LLM_MODEL,
|
| 1238 |
+
zip_path=str(temp_zip_path) if 'temp_zip_path' in locals() else None
|
| 1239 |
+
)
|
| 1240 |
+
|
| 1241 |
+
# Prepare response headers with tracking info
|
| 1242 |
+
headers = {
|
| 1243 |
+
"Content-Disposition": f"attachment; filename=docgenie_documents_{uuid.uuid4().hex[:8]}.zip"
|
| 1244 |
+
}
|
| 1245 |
+
|
| 1246 |
+
# Add tracking header if Supabase enabled
|
| 1247 |
+
if supabase_enabled and request_id:
|
| 1248 |
+
headers["X-Request-ID"] = request_id
|
| 1249 |
+
headers["X-Status-URL"] = f"/jobs/{request_id}/status"
|
| 1250 |
+
print(f"[Request {request_id}] Returning ZIP with tracking headers")
|
| 1251 |
+
|
| 1252 |
+
return StreamingResponse(
|
| 1253 |
+
zip_buffer,
|
| 1254 |
+
media_type="application/zip",
|
| 1255 |
+
headers=headers
|
| 1256 |
+
)
|
| 1257 |
+
|
| 1258 |
+
except HTTPException as e:
|
| 1259 |
+
# Update status to failed if Supabase enabled
|
| 1260 |
+
if supabase_enabled and request_id:
|
| 1261 |
+
try:
|
| 1262 |
+
from .supabase_client import supabase_client
|
| 1263 |
+
supabase_client.update_request_status(request_id, "failed", error_message=str(e.detail))
|
| 1264 |
+
print(f"[Request {request_id}] Status: failed - {e.detail}")
|
| 1265 |
+
except Exception as update_error:
|
| 1266 |
+
print(f"Warning: Status update failed: {update_error}")
|
| 1267 |
+
raise
|
| 1268 |
+
except Exception as e:
|
| 1269 |
+
# Update status to failed if Supabase enabled
|
| 1270 |
+
if supabase_enabled and request_id:
|
| 1271 |
+
try:
|
| 1272 |
+
from .supabase_client import supabase_client
|
| 1273 |
+
supabase_client.update_request_status(request_id, "failed", error_message=str(e))
|
| 1274 |
+
print(f"[Request {request_id}] Status: failed - {str(e)}")
|
| 1275 |
+
except Exception as sup_err:
|
| 1276 |
+
print(f"[Request {request_id}] ⚠ Supabase update failed: {sup_err}")
|
| 1277 |
+
print(f"Unexpected error: {str(e)}")
|
| 1278 |
+
raise HTTPException(
|
| 1279 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 1280 |
+
detail=f"Internal server error: {str(e)}"
|
| 1281 |
+
)
|
| 1282 |
+
|
| 1283 |
+
|
| 1284 |
+
# ==================== Background Task Functions ====================
|
| 1285 |
+
|
| 1286 |
+
def upload_documents_to_supabase_background(
    request_id: str,
    user_id: str,
    temp_dir: str,
    num_documents: int,
    model_version: str,
    zip_path: Optional[str] = None
):
    """
    Background task to upload individual documents to Supabase Storage.

    Uploads the combined ZIP (if provided), then for each generated document
    uploads the PDF (final, falling back to initial), ground-truth JSON,
    source HTML and word-level bbox JSON, and registers the document via
    ``supabase_client.create_generated_document``. The temporary directory
    is always removed afterwards.

    Args:
        request_id: Supabase request ID the uploads belong to.
        user_id: Owner user ID, used to build storage paths.
        temp_dir: Directory containing the exported dataset structure.
        num_documents: Number of documents expected (document_1..document_N).
        model_version: Model identifier stored with each document record.
        zip_path: Optional path to the combined ZIP archive.
    """
    import shutil
    import pathlib
    import traceback

    try:
        print(f"[Background Task {request_id}] Starting Supabase individual document upload...")
        from .supabase_client import supabase_client

        base_path = pathlib.Path(temp_dir)

        # Upload the combined ZIP first (if the caller produced one).
        zip_url = None
        if zip_path and pathlib.Path(zip_path).exists():
            zip_file = pathlib.Path(zip_path)
            zip_storage_path = f"{user_id}/{request_id}/generated/docgenie_{request_id}.zip"
            supabase_client.upload_to_storage("doc_storage", zip_storage_path, zip_file.read_bytes(), "application/zip")
            zip_url = supabase_client.get_public_url("doc_storage", zip_storage_path)
            print(f"[Background Task {request_id}] ✓ Uploaded ZIP to Supabase: {zip_url}")

        for idx in range(num_documents):
            doc_id = f"document_{idx + 1}"

            # FIX: isolate per-document failures so one broken document no
            # longer aborts the uploads for every remaining document
            # (mirrors the "continue with other documents" policy used in
            # the generation loop).
            try:
                # Storage destinations
                doc_storage_path = f"{user_id}/{request_id}/generated/{idx}_doc.pdf"
                gt_storage_path = f"{user_id}/{request_id}/generated/{idx}_gt.json"
                html_storage_path = f"{user_id}/{request_id}/generated/{idx}_src.html"
                bbox_storage_path = f"{user_id}/{request_id}/generated/{idx}_bbox.json"

                # Local source paths (prefer the final PDF, fall back to initial)
                local_pdf = base_path / "pdf" / "pdf_final" / f"{doc_id}.pdf"
                if not local_pdf.exists():
                    local_pdf = base_path / "pdf" / "pdf_initial" / f"{doc_id}.pdf"

                local_gt = base_path / "annotations" / "gt" / f"{doc_id}.json"
                local_html = base_path / "html" / f"{doc_id}.html"
                local_bbox = base_path / "bbox" / "bbox_final" / "word" / f"{doc_id}.json"

                # Upload whichever artifacts exist on disk
                pdf_url = None
                if local_pdf.exists():
                    supabase_client.upload_to_storage("doc_storage", doc_storage_path, local_pdf.read_bytes(), "application/pdf")
                    pdf_url = supabase_client.get_public_url("doc_storage", doc_storage_path)

                if local_gt.exists():
                    supabase_client.upload_to_storage("doc_storage", gt_storage_path, local_gt.read_bytes(), "application/json")

                if local_html.exists():
                    supabase_client.upload_to_storage("doc_storage", html_storage_path, local_html.read_bytes(), "text/html")

                if local_bbox.exists():
                    supabase_client.upload_to_storage("doc_storage", bbox_storage_path, local_bbox.read_bytes(), "application/json")

                # Record the document row; paths are only stored when the
                # corresponding local file actually existed.
                supabase_client.create_generated_document(
                    request_id=request_id,
                    file_url=pdf_url,
                    file_type="application/pdf" if pdf_url else None,
                    model_version=model_version,
                    doc_index=idx,
                    doc_storage_path=doc_storage_path if local_pdf.exists() else None,
                    gt_storage_path=gt_storage_path if local_gt.exists() else None,
                    html_storage_path=html_storage_path if local_html.exists() else None,
                    bbox_storage_path=bbox_storage_path if local_bbox.exists() else None,
                    zip_url=zip_url
                )
                print(f"[Background Task {request_id}] ✓ Uploaded and tracked document {idx}")
            except Exception as doc_err:
                print(f"[Background Task {request_id}] ⚠ Upload failed for document {idx}: {doc_err}")
                traceback.print_exc()

    except Exception as e:
        print(f"[Background Task {request_id}] ⚠ Supabase upload failed: {str(e)}")
        traceback.print_exc()
    finally:
        try:
            # Clean up temporary directory
            shutil.rmtree(temp_dir, ignore_errors=True)
            print(f"[Background Task {request_id}] ✓ Cleaned up temporary directory {temp_dir}")
        except Exception as e:
            print(f"[Background Task {request_id}] ⚠ Failed to clean up temp dir: {e}")
|
| 1373 |
+
|
| 1374 |
+
def upload_zip_to_gdrive_background(
    request_id: str,
    zip_path: pathlib.Path,
    access_token: str,
    refresh_token: Optional[str],
    num_documents: int
):
    """
    Background task that pushes the generated ZIP archive to Google Drive.

    On success the Supabase request is marked "completed" and the temporary
    ZIP file is removed. On failure the status becomes
    "completed_gdrive_failed" (the documents themselves were generated) and
    the temp file is removed on a best-effort basis.

    Args:
        request_id: Supabase request ID being tracked.
        zip_path: Path to the temporary ZIP file to upload.
        access_token: Google Drive OAuth access token.
        refresh_token: Google Drive refresh token (optional).
        num_documents: Number of documents contained in the ZIP.
    """
    try:
        print(f"[Background Task {request_id}] Starting GDrive upload...")

        from .google_drive import GoogleDriveClient
        from .supabase_client import supabase_client

        # Build the Drive client and push the archive.
        drive = GoogleDriveClient(
            access_token=access_token,
            refresh_token=refresh_token
        )
        zip_name = f"docgenie_{request_id}.zip"
        uploaded_url = drive.upload_file(
            file_path=zip_path,
            filename=zip_name,
            folder_name=settings.GOOGLE_DRIVE_FOLDER_NAME,
            mime_type="application/zip"
        )
        print(f"[Background Task {request_id}] ✓ Uploaded to GDrive: {uploaded_url}")

        # Record the Drive URL in Supabase.
        supabase_client.create_generated_document(
            request_id=request_id,
            file_url=uploaded_url,
            file_type="application/zip",
            model_version=settings.LLM_MODEL
        )
        print(f"[Background Task {request_id}] ✓ Updated Supabase with GDrive URL")

        # Flag the request as done.
        supabase_client.update_request_status(request_id, "completed")
        print(f"[Background Task {request_id}] ✓ Status updated to completed")

        # Drop the temporary archive now that it is safely in Drive.
        zip_path.unlink(missing_ok=True)
        print(f"[Background Task {request_id}] ✓ Cleaned up temp file")

    except Exception as e:
        print(f"[Background Task {request_id}] ✗ GDrive upload failed: {str(e)}")
        import traceback
        traceback.print_exc()

        # A Drive token was supplied, so record the partial outcome rather
        # than a plain failure.
        try:
            from .supabase_client import supabase_client
            supabase_client.update_request_status(request_id, "completed_gdrive_failed")
            print(f"[Background Task {request_id}] Status updated to completed_gdrive_failed")
        except Exception as status_err:
            print(f"[Background Task {request_id}] Failed to update status: {status_err}")

        # Best-effort removal of the temp file even after a failed upload.
        try:
            zip_path.unlink(missing_ok=True)
        except Exception:
            pass
|
| 1448 |
+
|
| 1449 |
+
|
| 1450 |
+
# ==================== New Async Endpoints (Batched API) ====================
|
| 1451 |
+
|
| 1452 |
+
from redis import Redis
from rq import Queue
from rq.job import Job
from .supabase_client import supabase_client
from .worker import process_document_generation_job


# Initialize Redis and RQ
# Module-level connection: runs once at import time. If Redis is unreachable
# the API still starts, but `job_queue` stays None and /generate/async
# responds with 503 (it checks `if not job_queue`).
try:
    redis_conn = Redis.from_url(settings.REDIS_URL)
    job_queue = Queue(settings.RQ_QUEUE_NAME, connection=redis_conn)
    print(f"✓ Connected to Redis: {settings.REDIS_URL}")
    print(f"✓ RQ Queue: {settings.RQ_QUEUE_NAME}")
except Exception as e:
    # Degrade gracefully: the synchronous endpoints keep working without Redis.
    print(f"⚠ Warning: Redis connection failed: {e}")
    print(" Async endpoints will not work without Redis")
    redis_conn = None
    job_queue = None
|
| 1470 |
+
|
| 1471 |
+
|
| 1472 |
+
@app.post("/generate/async")
async def generate_documents_async(request: GenerateDocumentRequest):
    """
    Generate synthetic documents asynchronously using batched Claude API.

    **Workflow:**
    1. Frontend creates document_requests entry in Supabase with status="pending"
    2. Frontend sends request_id to this endpoint along with tokens and seed images
    3. API fetches existing request, validates, and enqueues background job
    4. API returns immediately with job info
    5. Background worker processes job and updates status: processing → generating → completed/failed
    6. User polls /jobs/{request_id}/status for progress
    7. Upon completion, ZIP is automatically uploaded to Google Drive

    Uses batched Claude API for 50% cost savings (but takes 5-30 minutes).

    Request body:
    - request_id: UUID of existing document_requests entry (required)
    - seed_images: List[str] (Supabase storage URLs) (required)
    - google_drive_token: OAuth token for GDrive upload (optional)
    - google_drive_refresh_token: Refresh token for GDrive (optional)
    - prompt_params: dict (language, doc_type, num_solutions, etc.)

    Returns:
    - request_id: UUID to track job
    - status: "pending"
    - estimated_time_minutes: int
    - poll_url: URL to check status

    Raises:
        HTTPException: 503 when Redis/RQ is unavailable, 404 when the
            request row does not exist, 400 for missing seed images,
            500 for unexpected enqueue failures.
    """
    if not job_queue:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Background job queue not available. Redis connection required."
        )

    # Resolve tracking identifiers from the incoming request.
    user_id_from_input, request_id = parse_request_id(request.request_id)
    user_id = user_id_from_input

    try:
        # Fetch request from Supabase
        existing_request = supabase_client.get_request(request_id)
        if not existing_request:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Request {request_id} not found in database"
            )

        # Use user_id from input if available, otherwise from database
        if not user_id:
            user_id = existing_request["user_id"]

        print(f"[Request {request_id}] Processing async request for user {user_id}")
        print(f"[Request {request_id}] Current status: {existing_request['status']}")

        # Validate seed images
        if not request.seed_images:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="At least one seed image is required"
            )

        # Update status to processing (job is being queued)
        supabase_client.update_request_status(request_id, "processing")
        print(f"[Request {request_id}] Status: processing (queuing job)")

        # Prepare job data
        job_data = {
            "user_id": user_id,
            "google_drive_token": request.google_drive_token,
            "google_drive_refresh_token": request.google_drive_refresh_token,
            "seed_images": [str(url) for url in request.seed_images],
            "prompt_params": request.prompt_params.dict()
        }

        # Enqueue background job
        job = job_queue.enqueue(
            process_document_generation_job,
            request_id=request_id,
            request_data=job_data,
            job_timeout='2h',  # 2 hours max (batched API can take time)
            result_ttl=86400,  # Keep result for 24 hours
            failure_ttl=86400  # Keep failure info for 24 hours
        )

        print(f"Enqueued job {job.id} for request {request_id}")

        # Estimate time based on num_solutions
        num_solutions = request.prompt_params.num_solutions
        if num_solutions <= 3:
            estimated_time = 10  # ~10 minutes for small batch
        elif num_solutions <= 10:
            estimated_time = 20  # ~20 minutes for medium batch
        else:
            estimated_time = 30 + (num_solutions - 10) * 2  # Scale up

        # Log analytics
        supabase_client.log_analytics_event(
            user_id=user_id,
            event_type="document_generation_requested",
            entity_id=request_id
        )

        return {
            "request_id": request_id,
            "status": "pending",
            "estimated_time_minutes": estimated_time,
            "num_documents": num_solutions,
            "poll_url": f"/jobs/{request_id}/status",
            "message": f"Job queued successfully. Check status at /jobs/{request_id}/status"
        }

    except HTTPException as http_exc:
        # Mark the request failed so the frontend poll surfaces the error.
        try:
            supabase_client.update_request_status(request_id, "failed", error_message=str(http_exc.detail))
            print(f"[Request {request_id}] Status: failed - {http_exc.detail}")
        except Exception as update_error:
            print(f"Warning: Status update failed: {update_error}")
        raise
    except Exception as e:
        print(f"Error creating async job: {str(e)}")
        import traceback
        traceback.print_exc()

        # Update status to failed
        try:
            supabase_client.update_request_status(request_id, "failed", error_message=str(e))
            print(f"[Request {request_id}] Status: failed - {str(e)}")
        except Exception as update_error:
            print(f"Warning: Status update failed: {update_error}")

        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create job: {str(e)}"
        )
    # FIX: removed the dead `finally` block guarded by
    # `'assets_temp_dir' in locals()` — `assets_temp_dir` is never assigned
    # anywhere in this function, so that branch could never execute
    # (copy-paste residue from the synchronous endpoint).
|
| 1618 |
+
|
| 1619 |
+
|
| 1620 |
+
@app.get("/jobs/{request_id}/status")
async def get_job_status(request_id: str):
    """
    Get status of a document generation job.

    Returns:
        - request_id: UUID
        - status: pending | processing | generating | completed | failed
        - created_at: ISO timestamp
        - updated_at: ISO timestamp
        - error_message: str (if failed)
        - results: dict with per-document info and download_url (if completed)

    Raises:
        HTTPException: 404 when the request does not exist, 500 on
            unexpected lookup failures.
    """
    try:
        # Get request from Supabase
        request_data = supabase_client.get_request(request_id)

        if not request_data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Request {request_id} not found"
            )

        # FIX: tolerate a missing/NULL metadata or prompt_params payload
        # instead of raising KeyError (which surfaced to the client as 500).
        prompt_params = (request_data.get("metadata") or {}).get("prompt_params") or {}

        response = {
            "request_id": request_id,
            "status": request_data["status"],
            "created_at": request_data["created_at"],
            "updated_at": request_data["updated_at"],
            "num_documents": prompt_params.get("num_solutions")
        }

        # Add error message if failed
        if request_data["status"] == "failed":
            response["error_message"] = request_data.get("error_message")

        # Add result URL if completed
        if request_data["status"] == "completed":
            # Get generated documents
            generated_docs = supabase_client.get_generated_documents(request_id)

            if generated_docs:
                response["results"] = {
                    "documents": [
                        {
                            "id": doc.get("id"),
                            "doc_index": doc.get("doc_index"),
                            "pdf_url": doc.get("file_url"),
                            "doc_storage_path": doc.get("doc_storage_path"),
                            "gt_storage_path": doc.get("gt_storage_path"),
                            "html_storage_path": doc.get("html_storage_path"),
                            "bbox_storage_path": doc.get("bbox_storage_path")
                        } for doc in generated_docs if doc.get("doc_index") is not None
                    ],
                    "zip_filename": f"docgenie_{request_id}.zip"
                }

                # If there's a zip file (legacy or background GDrive task), add it too
                zip_docs = [doc for doc in generated_docs if doc.get("file_type") == "application/zip"]
                if zip_docs:
                    response["results"]["download_url"] = zip_docs[0].get("file_url")

        return response

    except HTTPException:
        raise
    except Exception as e:
        print(f"Error fetching job status: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to fetch job status: {str(e)}"
        )
|
| 1691 |
+
|
| 1692 |
+
|
| 1693 |
+
@app.get("/jobs/user/{user_id}")
async def get_user_jobs(user_id: int, limit: int = 50, offset: int = 0):
    """
    Get all jobs for a user.

    Query params:
        - limit: int (default: 50, max: 100)
        - offset: int (default: 0)

    Returns:
        List of job status objects

    Raises:
        HTTPException: 500 on unexpected lookup failures.
    """
    try:
        # Validate limit
        if limit > 100:
            limit = 100

        # Get user's requests from Supabase
        requests = supabase_client.get_user_requests(user_id, limit, offset)

        results = []
        for request_data in requests:
            # FIX: tolerate a missing/NULL metadata or prompt_params payload
            # instead of raising KeyError, which failed the whole listing
            # with a 500 because of one malformed row.
            prompt_params = (request_data.get("metadata") or {}).get("prompt_params") or {}

            result = {
                "request_id": request_data["id"],
                "status": request_data["status"],
                "created_at": request_data["created_at"],
                "updated_at": request_data["updated_at"],
                "num_documents": prompt_params.get("num_solutions")
            }

            if request_data["status"] == "failed":
                result["error_message"] = request_data.get("error_message")

            if request_data["status"] == "completed":
                # Get generated documents
                generated_docs = supabase_client.get_generated_documents(request_data["id"])
                if generated_docs:
                    # NOTE(review): uses the first record's file_url —
                    # presumably the primary artifact; verify the ordering
                    # returned by get_generated_documents.
                    result["download_url"] = generated_docs[0]["file_url"]

            results.append(result)

        return {
            "user_id": user_id,
            "jobs": results,
            "count": len(results),
            "limit": limit,
            "offset": offset
        }

    except Exception as e:
        print(f"Error fetching user jobs: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to fetch user jobs: {str(e)}"
        )
|
| 1748 |
+
|
| 1749 |
+
|
| 1750 |
+
# Entry point for running the API directly (development convenience).
# Production deployments launch uvicorn via start.sh / a process manager instead.
if __name__ == "__main__":
    uvicorn.run(
        "main:app",
        host=settings.API_HOST,
        port=settings.API_PORT,
        # Auto-reload only in debug mode; reload is incompatible with multi-worker runs.
        reload=settings.DEBUG_MODE
    )
|
api/quick_test.sh
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Quick test script - tests async API with Google Drive upload
# Usage: ./quick_test.sh YOUR_GOOGLE_ACCESS_TOKEN
#
# End-to-end smoke test: health check -> submit async generation job ->
# poll job status up to 5 times (10s apart). Exits 0 on completion,
# 1 on submit failure or job failure; prints manual polling commands
# if the job is still running after the last poll.
# NOTE(review): assumes `python` (not python3) and `curl` are on PATH — confirm on target hosts.

set -e

GOOGLE_TOKEN=$1
# Optional second argument overrides the API base URL.
BASE_URL=${2:-"http://localhost:8000"}

if [ -z "$GOOGLE_TOKEN" ]; then
    echo "Usage: ./quick_test.sh YOUR_GOOGLE_ACCESS_TOKEN [BASE_URL]"
    echo ""
    echo "To get a Google token, run:"
    echo " python test_get_google_token.py --client-id YOUR_ID --client-secret YOUR_SECRET"
    echo ""
    echo "Or see TESTING.md for detailed instructions"
    exit 1
fi

echo "==========================================="
echo "Quick Test: Async API + Google Drive"
echo "==========================================="
echo "API: $BASE_URL"
# Only the token prefix is printed, to avoid leaking the full credential in logs.
echo "Token: ${GOOGLE_TOKEN:0:20}..."
echo ""

# Step 1: Health check
echo "1. Health Check..."
curl -s "$BASE_URL/health" | python -m json.tool
echo ""

# Step 2: Submit job
# The token is interpolated into the JSON body, so the payload uses escaped
# double quotes rather than a single-quoted heredoc.
echo "2. Submitting Job..."
RESPONSE=$(curl -s -X POST "$BASE_URL/generate/async" \
    -H "Content-Type: application/json" \
    -d "{
        \"user_id\": 1,
        \"google_drive_token\": \"$GOOGLE_TOKEN\",
        \"seed_images\": [\"https://ocr.space/Content/Images/receipt-ocr-original.webp\"],
        \"prompt_params\": {
            \"language\": \"English\",
            \"doc_type\": \"receipts\",
            \"num_solutions\": 1,
            \"enable_handwriting\": false,
            \"enable_visual_elements\": false,
            \"output_detail\": \"minimal\"
        }
    }")

echo "$RESPONSE" | python -m json.tool
echo ""

# Extract request_id; `|| echo ""` keeps `set -e` from killing the script
# when the response is not valid JSON (handled by the emptiness check below).
REQUEST_ID=$(echo "$RESPONSE" | python -c "import sys, json; print(json.load(sys.stdin)['request_id'])" 2>/dev/null || echo "")

if [ -z "$REQUEST_ID" ]; then
    echo "✗ Failed to submit job"
    exit 1
fi

echo "✓ Job ID: $REQUEST_ID"
echo ""

# Step 3: Poll status
echo "3. Polling Status (will check 5 times, 10s apart)..."
for i in {1..5}; do
    echo " Poll $i/5..."
    STATUS=$(curl -s "$BASE_URL/jobs/$REQUEST_ID/status")
    CURRENT_STATUS=$(echo "$STATUS" | python -c "import sys, json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unknown")
    echo " Status: $CURRENT_STATUS"

    if [ "$CURRENT_STATUS" = "completed" ]; then
        echo ""
        echo "✓ JOB COMPLETED!"
        echo "$STATUS" | python -m json.tool
        exit 0
    elif [ "$CURRENT_STATUS" = "failed" ]; then
        echo ""
        echo "✗ JOB FAILED"
        echo "$STATUS" | python -m json.tool
        exit 1
    fi

    # Sleep between polls, but not after the final one.
    if [ $i -lt 5 ]; then
        sleep 10
    fi
done

echo ""
echo "⏱ Job still in progress. Continue polling manually:"
echo " curl $BASE_URL/jobs/$REQUEST_ID/status"
echo ""
echo "Or use the full test script:"
echo " python test_async_api.py --google-token $GOOGLE_TOKEN"
|
api/requirements.txt
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================================
|
| 2 |
+
# DocGenie API Requirements
|
| 3 |
+
# ============================================
|
| 4 |
+
# NOTE: These dependencies are also specified in the root pyproject.toml
|
| 5 |
+
# This file exists for standalone API deployment convenience
|
| 6 |
+
# For development, use: uv sync (from root directory)
|
| 7 |
+
# For production API-only deployment: pip install -r requirements.txt
|
| 8 |
+
# Aligned with pyproject.toml versions used to run pipeline locally
|
| 9 |
+
|
| 10 |
+
# FastAPI Framework
|
| 11 |
+
fastapi>=0.109.0
|
| 12 |
+
uvicorn[standard]>=0.27.0
|
| 13 |
+
python-multipart>=0.0.6
|
| 14 |
+
|
| 15 |
+
# Pydantic for data validation
|
| 16 |
+
pydantic==2.11.7
|
| 17 |
+
pydantic-core==2.33.2
|
| 18 |
+
pydantic-settings>=2.11.0
|
| 19 |
+
|
| 20 |
+
# Environment variables
|
| 21 |
+
python-dotenv>=1.0.0
|
| 22 |
+
|
| 23 |
+
# HTTP client for async requests
|
| 24 |
+
httpx==0.28.1
|
| 25 |
+
aiohttp==3.12.15
|
| 26 |
+
|
| 27 |
+
# Retry logic for external services
|
| 28 |
+
tenacity>=8.2.3
|
| 29 |
+
|
| 30 |
+
# Claude API
|
| 31 |
+
anthropic==0.64.0
|
| 32 |
+
|
| 33 |
+
# HTML rendering and PDF generation
|
| 34 |
+
playwright>=1.55.0
|
| 35 |
+
beautifulsoup4==4.13.4
|
| 36 |
+
lxml>=5.1.0
|
| 37 |
+
|
| 38 |
+
# PDF processing
|
| 39 |
+
PyMuPDF==1.26.3
|
| 40 |
+
pdf2image==1.17.0
|
| 41 |
+
pypdf2==3.0.1
|
| 42 |
+
|
| 43 |
+
# Image processing for Stage 3
|
| 44 |
+
Pillow==11.3.0
|
| 45 |
+
numpy==1.26.4
|
| 46 |
+
|
| 47 |
+
# CSS parsing for Stage 3
|
| 48 |
+
cssutils==2.11.1
|
| 49 |
+
|
| 50 |
+
# Progress bars and logging
|
| 51 |
+
rich==14.1.0
|
| 52 |
+
|
| 53 |
+
# Additional utilities
|
| 54 |
+
python-dateutil==2.9.0.post0
|
| 55 |
+
requests==2.32.5
|
| 56 |
+
|
| 57 |
+
# Background job queue (Redis + RQ)
|
| 58 |
+
redis>=5.0.0
|
| 59 |
+
rq>=1.15.0
|
| 60 |
+
|
| 61 |
+
# Supabase client for database
|
| 62 |
+
supabase>=2.0.0
|
| 63 |
+
|
| 64 |
+
# Google Drive API integration
|
| 65 |
+
google-api-python-client>=2.100.0
|
| 66 |
+
google-auth-httplib2>=0.2.0
|
| 67 |
+
google-auth-oauthlib>=1.2.0
|
| 68 |
+
|
| 69 |
+
# ============================================
|
| 70 |
+
# Optional dependencies for advanced features
|
| 71 |
+
# ============================================
|
| 72 |
+
# OCR support (requires system tesseract-ocr)
|
| 73 |
+
pytesseract>=0.3.10
|
| 74 |
+
|
| 75 |
+
# Barcode generation
|
| 76 |
+
python-barcode>=0.15.1
|
| 77 |
+
|
| 78 |
+
# Dataset export in msgpack format
|
| 79 |
+
datadings>=0.4.3
|
api/schemas.py
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic schemas for API request/response models.
|
| 3 |
+
"""
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
from pydantic import BaseModel, HttpUrl, Field, field_validator
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class PromptParameters(BaseModel):
    """Parameters for customizing the document generation prompt.

    Groups the knobs for every pipeline stage: content generation,
    feature synthesis (Stage 3), OCR (Stage 4), and dataset packaging
    (Stage 5). All fields have defaults, so an empty object is valid.
    """

    # --- Content generation ---
    language: str = Field(default="English", description="Language for generated documents")
    doc_type: str = Field(default="business and administrative", description="Type of documents to generate (e.g., 'business and administrative', 'receipts', 'forms')")
    gt_type: str = Field(default="Multiple questions about each document, with their answers taken **verbatim** from the document.", description="Description of ground truth type to generate")
    gt_format: str = Field(default='{"<Text of question 1>": "<Answer to question 1>", "<Text of question 2>": "<Answer to question 2>", ...}', description="Format specification for ground truth JSON")
    num_solutions: int = Field(default=1, ge=1, le=5, description="Number of document variations to generate (1-5)")

    # --- Stage 3: Feature Synthesis ---
    enable_handwriting: bool = Field(default=False, description="Enable handwriting generation (requires EC2 handwriting service)")
    handwriting_ratio: float = Field(default=0.2, ge=0.0, le=1.0, description="Proportion of text to convert to handwriting (0.0-1.0)")
    enable_visual_elements: bool = Field(default=True, description="Enable visual element generation (stamps, logos, barcodes)")
    visual_element_types: List[str] = Field(default=["stamp", "logo", "figure", "barcode", "photo"], description="Types of visual elements to generate (stamp, logo, figure, barcode, photo)")
    barcode_number: Optional[str] = Field(default=None, description="Optional fixed number for barcode generation (numeric only)")
    seed: Optional[int] = Field(default=None, description="Random seed for reproducible generation", examples=[None, 42])

    # --- Stage 4: Image Finalization & OCR ---
    enable_ocr: bool = Field(default=True, description="Enable OCR on final document images (requires OCR service)")
    ocr_language: str = Field(default="en", description="Language for OCR (e.g., 'en', 'de', 'fr')")

    # --- Stage 5: Dataset Packaging ---
    enable_bbox_normalization: bool = Field(default=True, description="Normalize bounding boxes to [0,1] scale (Stage 16)")
    enable_gt_verification: bool = Field(default=True, description="Verify and prepare ground truth annotations (Stage 17)")
    enable_analysis: bool = Field(default=True, description="Generate dataset statistics and analysis (Stage 18)")
    enable_debug_visualization: bool = Field(default=True, description="Create debug visualization overlays (Stage 19)")
    enable_dataset_export: bool = Field(default=True, description="Export as msgpack dataset format")
    dataset_export_format: str = Field(default="msgpack", description="Dataset export format: 'msgpack', 'coco', 'huggingface'")
    output_detail: str = Field(default="dataset", description="Output detail level: 'minimal' (final outputs only), 'dataset' (includes individual tokens/elements for ML), 'complete' (all intermediate files and debug info). Warning: 'complete' mode can produce 50+ MB responses.")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class SeedImage(BaseModel):
    """Seed image URL for document generation."""

    # Defaults to a public sample receipt so the model validates with no input.
    url: HttpUrl = Field(default=HttpUrl("https://ocr.space/Content/Images/receipt-ocr-original.webp"), description="URL of the seed image")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class GenerateDocumentRequest(BaseModel):
    """Request schema for document generation endpoint."""
    request_id: str = Field(
        description="Document request UUID from document_requests table (created by frontend)"
    )
    google_drive_token: Optional[str] = Field(
        default=None,
        description="Google Drive OAuth access token. Frontend provides this after OAuth flow (optional)."
    )
    google_drive_refresh_token: Optional[str] = Field(
        default=None,
        description="Google Drive refresh token (optional, for automatic token renewal)"
    )
    seed_images: List[HttpUrl] = Field(
        # default_factory makes the mutable-list default explicit: a fresh
        # list per instance instead of one shared literal.
        default_factory=lambda: [HttpUrl("https://ocr.space/Content/Images/receipt-ocr-original.webp")],
        description="List of seed image URLs (1-10 images)"
    )
    prompt_params: PromptParameters = Field(
        default_factory=PromptParameters,
        description="Parameters for customizing the generation prompt"
    )

    @field_validator('seed_images')
    @classmethod
    def validate_seed_images(cls, v):
        """Ensure between 1 and 10 seed images are provided."""
        # `not v` already covers the empty-list case; the original's second
        # `len(v) < 1` check was unreachable dead code and has been removed.
        if not v:
            raise ValueError('At least one seed image is required')
        if len(v) > 10:
            raise ValueError('Maximum 10 seed images allowed')
        return v
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class OCRWord(BaseModel):
    """OCR word-level result."""
    # Geometry is in absolute pixels; confidence is normalized to [0, 1].
    text: str = Field(description="Recognized text")
    confidence: float = Field(ge=0.0, le=1.0, description="OCR confidence score (0-1)")
    x: float = Field(description="X coordinate (pixels)")
    y: float = Field(description="Y coordinate (pixels)")
    width: float = Field(description="Width (pixels)")
    height: float = Field(description="Height (pixels)")
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class OCRLine(BaseModel):
    """OCR line-level result."""
    # Same pixel-space geometry as OCRWord, plus the constituent words.
    text: str = Field(description="Recognized text")
    confidence: float = Field(ge=0.0, le=1.0, description="OCR confidence score (0-1)")
    x: float = Field(description="X coordinate (pixels)")
    y: float = Field(description="Y coordinate (pixels)")
    width: float = Field(description="Width (pixels)")
    height: float = Field(description="Height (pixels)")
    words: List[OCRWord] = Field(default_factory=list, description="Words in this line")
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class OCRResult(BaseModel):
    """OCR results for a document."""
    # Image dimensions give the reference frame for the pixel coordinates
    # in `words` / `lines`.
    image_width: int = Field(description="Image width in pixels")
    image_height: int = Field(description="Image height in pixels")
    words: List[OCRWord] = Field(default_factory=list, description="Word-level OCR results")
    lines: List[OCRLine] = Field(default_factory=list, description="Line-level OCR results")
    angle: float = Field(default=0.0, description="Detected text orientation angle")
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class NormalizedBBox(BaseModel):
    """Normalized bounding box (Stage 16)."""
    # Coordinates are scaled to [0, 1]; (x0, y0) is the min corner and
    # (x2, y2) the max corner of the box.
    text: str = Field(description="Text content")
    x0: float = Field(ge=0.0, le=1.0, description="Normalized X min (0-1)")
    y0: float = Field(ge=0.0, le=1.0, description="Normalized Y min (0-1)")
    x2: float = Field(ge=0.0, le=1.0, description="Normalized X max (0-1)")
    y2: float = Field(ge=0.0, le=1.0, description="Normalized Y max (0-1)")
    # Optional structural indices (block/line/word position in the document).
    block_no: Optional[int] = Field(default=None, description="Block number")
    line_no: Optional[int] = Field(default=None, description="Line number")
    word_no: Optional[int] = Field(default=None, description="Word number")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
class GTVerificationResult(BaseModel):
    """Ground truth verification results (Stage 17)."""
    passed: bool = Field(description="Whether GT verification passed")
    skipped: bool = Field(default=False, description="Whether verification was skipped")
    # confirmed_keys and similarities are parallel outputs of the
    # verification pass over the ground-truth entries.
    confirmed_keys: List[str] = Field(default_factory=list, description="Confirmed GT keys")
    similarities: List[float] = Field(default_factory=list, description="Similarity scores")
    num_layout_elements: Optional[int] = Field(default=None, description="Number of layout elements")
    valid_labels: bool = Field(default=True, description="Whether all labels are valid")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class AnalysisStats(BaseModel):
    """Dataset analysis and statistics (Stage 18)."""
    total_documents: int = Field(description="Total documents processed")
    valid_documents: int = Field(description="Documents passing all validation")
    # Counts below are per-feature document tallies, not percentages.
    error_counts: dict = Field(default_factory=dict, description="Error type counts")
    has_handwriting: int = Field(default=0, description="Documents with handwriting")
    has_visual_elements: int = Field(default=0, description="Documents with visual elements")
    has_ocr: int = Field(default=0, description="Documents with OCR results")
    multipage_count: int = Field(default=0, description="Multipage documents")
    token_usage: Optional[dict] = Field(default=None, description="LLM token usage statistics")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class DebugVisualization(BaseModel):
    """Debug visualization data (Stage 19)."""
    # Each overlay is an optional base64-encoded PNG rendering of the
    # document with one annotation layer drawn on top.
    bbox_overlay_base64: Optional[str] = Field(default=None, description="Image with bbox overlays (PNG base64)")
    visual_elements_overlay_base64: Optional[str] = Field(default=None, description="Image with visual element overlays")
    handwriting_overlay_base64: Optional[str] = Field(default=None, description="Image with handwriting overlays")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
class DatasetExportInfo(BaseModel):
    """Dataset export metadata."""
    format: str = Field(description="Export format (msgpack, coco, etc.)")
    num_samples: int = Field(description="Number of samples in export")
    output_path: Optional[str] = Field(default=None, description="Path to exported dataset")
    # Inline payload is only populated for small exports; larger ones are
    # referenced via output_path.
    msgpack_base64: Optional[str] = Field(default=None, description="Msgpack file as base64 (for small datasets)")
    metadata: dict = Field(default_factory=dict, description="Dataset metadata")
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class BoundingBox(BaseModel):
    """Bounding box for a text element in the document."""
    # Unlike OCRWord/OCRLine (pixel space), these coordinates are
    # normalized to [0, 1] of the page dimensions.
    text: str = Field(description="Text content")
    x: float = Field(description="X coordinate (normalized 0-1)")
    y: float = Field(description="Y coordinate (normalized 0-1)")
    width: float = Field(description="Width (normalized 0-1)")
    height: float = Field(description="Height (normalized 0-1)")
    page: int = Field(default=0, description="Page number (0-indexed)")
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
class HandwritingRegion(BaseModel):
    """Information about a handwriting region in the document."""
    region_id: str = Field(description="Unique region identifier")
    text: str = Field(description="Text content")
    # author_id range 0-656 matches the handwriting model's writer ID map;
    # reusing the same ID keeps handwriting style consistent across regions.
    author_id: int = Field(ge=0, le=656, description="Author ID for style consistency (0-656)")
    bbox: BoundingBox = Field(description="Bounding box of the region")
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
class VisualElement(BaseModel):
    """Information about a visual element in the document."""
    element_id: str = Field(description="Unique element identifier")
    element_type: str = Field(description="Type of visual element (stamp, logo, etc.)")
    # Content is element-type specific (e.g. stamp text, barcode number);
    # None for purely pictorial elements.
    content: Optional[str] = Field(default=None, description="Content (e.g., stamp text)")
    bbox: BoundingBox = Field(description="Bounding box of the element")
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class DocumentResult(BaseModel):
    """Result for a single generated document.

    Core fields are always populated; the Stage 3/4/5 sections below are
    None unless the corresponding pipeline stage was enabled for the run.
    """

    # --- Core output (always present) ---
    document_id: str = Field(description="Unique document identifier")
    html: str = Field(description="Generated HTML content")
    css: str = Field(description="Extracted CSS styles")
    ground_truth: Optional[dict] = Field(default=None, description="Ground truth data extracted from the document")
    pdf_base64: str = Field(description="Base64-encoded PDF document")
    bboxes: List[BoundingBox] = Field(default_factory=list, description="Bounding boxes for text elements")
    page_width_mm: float = Field(description="Page width in millimeters")
    page_height_mm: float = Field(description="Page height in millimeters")

    # --- Stage 3: feature synthesis ---
    handwriting_regions: Optional[List[dict]] = Field(default=None, description="Handwriting regions with metadata (if enabled)")
    visual_elements: Optional[List[dict]] = Field(default=None, description="Visual elements with metadata (if enabled)")
    image_base64: Optional[str] = Field(default=None, description="Final rendered image with handwriting/visuals (PNG base64, if Stage 3 enabled)")

    # --- Stage 3 individual tokens (dataset/complete output detail levels) ---
    handwriting_token_images: Optional[dict] = Field(default=None, description="Individual handwriting token images {hw_id: base64_png} (output_detail: dataset/complete)")
    visual_element_images: Optional[dict] = Field(default=None, description="Individual visual element images {ve_id: base64_png} (output_detail: dataset/complete)")
    token_mapping: Optional[dict] = Field(default=None, description="Token mapping with positions and style IDs (output_detail: dataset/complete)")

    # --- Stage 4: OCR ---
    ocr_results: Optional[OCRResult] = Field(default=None, description="OCR results from final image (if OCR enabled)")

    # --- Stage 5: dataset packaging ---
    normalized_bboxes_word: Optional[List[NormalizedBBox]] = Field(default=None, description="Word-level normalized bounding boxes (if Stage 16 enabled)")
    normalized_bboxes_segment: Optional[List[NormalizedBBox]] = Field(default=None, description="Segment-level normalized bounding boxes (if Stage 16 enabled)")
    gt_verification: Optional[GTVerificationResult] = Field(default=None, description="Ground truth verification results (if Stage 17 enabled)")
    analysis_stats: Optional[AnalysisStats] = Field(default=None, description="Document analysis statistics (if Stage 18 enabled)")
    debug_visualization: Optional[DebugVisualization] = Field(default=None, description="Debug visualization overlays (if Stage 19 enabled)")
    dataset_export: Optional[DatasetExportInfo] = Field(default=None, description="Dataset export information (if export enabled)")
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
class GenerateDocumentResponse(BaseModel):
    """Response schema for document generation endpoint."""
    success: bool = Field(description="Whether generation was successful")
    message: str = Field(description="Status message")
    documents: List[DocumentResult] = Field(
        default_factory=list,
        description="List of generated documents"
    )
    # NOTE(review): presumably equals len(documents) — confirm against the
    # endpoint that builds this response.
    total_documents: int = Field(
        default=0,
        description="Total number of documents generated"
    )
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
class HealthResponse(BaseModel):
    """Health check response."""
    # Static payload for the /health endpoint; both fields have constant defaults.
    status: str = Field(default="healthy")
    version: str = Field(default="1.0.0")
|
api/start.sh
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Start the DocGenie API server
# Note: All dependencies should be installed via 'uv sync' or 'pip install -e .'
#
# Environment (from .env): ANTHROPIC_API_KEY (required),
# API_HOST / API_PORT / API_WORKERS / DEBUG_MODE (optional).

echo "Starting DocGenie API..."

# Check if .env file exists; bootstrap it from the example if possible.
if [ ! -f .env ]; then
    echo "Warning: .env file not found. Using .env.example as template."
    echo "Please copy .env.example to .env and set your ANTHROPIC_API_KEY"

    if [ -f .env.example ]; then
        cp .env.example .env
        echo "Created .env file from .env.example"
    fi
fi

# Load environment variables.
# BUG FIX: the previous `export $(cat .env | grep -v '^#' | xargs)` broke on
# values containing spaces, quotes, or '='. `set -a` + source exports every
# assignment in .env and handles quoted values correctly.
if [ -f .env ]; then
    set -a
    # shellcheck disable=SC1091
    source .env
    set +a
fi

# Check if ANTHROPIC_API_KEY is set
if [ -z "$ANTHROPIC_API_KEY" ]; then
    echo "Error: ANTHROPIC_API_KEY not set in .env file"
    exit 1
fi

# Default values
HOST=${API_HOST:-0.0.0.0}
PORT=${API_PORT:-8000}
WORKERS=${API_WORKERS:-4}

echo "Configuration:"
echo " Host: $HOST"
echo " Port: $PORT"
echo " Workers: $WORKERS"
echo ""

# Start the API.
# BUG FIX: uvicorn ignores --workers when --reload is set, so the original
# `--workers N --reload` silently ran a single auto-reloading worker.
# Use --reload only in debug mode; otherwise run the requested worker count.
if [ "${DEBUG_MODE:-false}" = "true" ]; then
    echo "DEBUG_MODE=true: running single worker with auto-reload"
    uvicorn main:app --host "$HOST" --port "$PORT" --reload
else
    uvicorn main:app --host "$HOST" --port "$PORT" --workers "$WORKERS"
fi
|
api/start_worker.sh
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# ============================================
# DocGenie RQ Worker Startup Script
# ============================================
# This script starts an RQ (Redis Queue) worker for processing
# background document generation jobs.

set -e  # Exit on error

echo "🚀 Starting DocGenie RQ Worker..."

# Activate virtual environment
VENV_PATH="../.venv"
if [ -d "$VENV_PATH" ]; then
    echo "✓ Activating virtual environment..."
    source "$VENV_PATH/bin/activate"
else
    echo "⚠ Warning: Virtual environment not found at $VENV_PATH"
fi

# Load environment variables from .env using Python (handles special characters properly)
if [ -f .env ]; then
    echo "✓ Loading .env file..."
    # FIX: the command substitution must be double-quoted. Unquoted, the shell
    # word-splits the generated multi-line "export KEY='...'" statements and
    # eval re-joins them with spaces onto ONE line, which breaks any value
    # containing whitespace and can mis-parse subsequent assignments.
    eval "$(python -c "
import os
from dotenv import load_dotenv
load_dotenv()
for key, value in os.environ.items():
    # Only export DocGenie related variables
    if key.startswith(('REDIS', 'SUPABASE', 'ANTHROPIC', 'BATCH', 'MESSAGE', 'RQ_', 'GOOGLE')):
        # Properly escape single quotes in the value
        safe_value = value.replace(\"'\", \"'\\\\''\")
        print(f\"export {key}='{safe_value}'\")
")"
else
    echo "⚠ Warning: .env file not found"
fi

# Check Redis connection
echo "🔍 Checking Redis connection..."
if ! python -c "import redis; r = redis.from_url('${REDIS_URL:-redis://localhost:6379/0}'); r.ping()" 2>/dev/null; then
    echo "❌ Error: Cannot connect to Redis at ${REDIS_URL:-redis://localhost:6379/0}"
    echo "   Please ensure Redis is running:"
    echo "   $ docker run -d -p 6379:6379 redis:latest"
    echo "   OR"
    echo "   $ redis-server"
    exit 1
fi
echo "✓ Redis connected"

# Check Supabase configuration
if [ -z "$SUPABASE_URL" ] || [ -z "$SUPABASE_KEY" ]; then
    echo "❌ Error: SUPABASE_URL and SUPABASE_KEY must be set in .env"
    exit 1
fi
echo "✓ Supabase configured"

# Check Claude API key
if [ -z "$ANTHROPIC_API_KEY" ]; then
    echo "❌ Error: ANTHROPIC_API_KEY must be set in .env"
    exit 1
fi
echo "✓ Claude API key configured"

# Create temporary directories
mkdir -p "${BATCH_DATA_DIR:-/tmp/docgenie_batches}"
mkdir -p "${MESSAGE_DATA_DIR:-/tmp/docgenie_messages}"
echo "✓ Temporary directories created"

# Start worker
QUEUE_NAME="${RQ_QUEUE_NAME:-docgenie}"
echo ""
echo "============================================"
echo "Worker Configuration:"
echo "  Queue: $QUEUE_NAME"
echo "  Redis: ${REDIS_URL:-redis://localhost:6379/0}"
echo "  Batch Data: ${BATCH_DATA_DIR:-/tmp/docgenie_batches}"
echo "  Message Data: ${MESSAGE_DATA_DIR:-/tmp/docgenie_messages}"
echo "============================================"
echo ""
echo "✅ Starting RQ worker (press Ctrl+C to stop)..."
echo ""

# Run RQ worker
# - Listen on specified queue
# - Burst mode: exit when queue is empty (use for testing)
# - Remove --burst for production (keeps running)
# Use PYTHONPATH to ensure worker.py can be imported
PYTHONPATH="$(pwd):$PYTHONPATH" rq worker "$QUEUE_NAME" \
    --url "${REDIS_URL:-redis://localhost:6379/0}" \
    --verbose
    # --burst  # Uncomment for testing (exit when queue empty)

# Note: Worker will keep running until Ctrl+C is pressed
# In production, use a process manager like systemd or supervisor
|
api/supabase_client.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Supabase client for database operations.
|
| 3 |
+
Handles document requests, generated documents, and user integrations.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Optional, Dict, Any, List
|
| 7 |
+
import os
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from supabase import create_client, Client
|
| 10 |
+
from .config import settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SupabaseClient:
    """Wrapper for Supabase operations related to document generation.

    Groups three concerns:
      * CRUD on the ``document_requests`` and ``generated_documents`` tables
      * file operations on Supabase storage buckets
      * user-integration lookups and analytics event logging
    """

    def __init__(self):
        """Create the underlying Supabase client from settings.

        Raises:
            ValueError: If SUPABASE_URL or SUPABASE_KEY is not configured.
        """
        if not settings.SUPABASE_URL or not settings.SUPABASE_KEY:
            raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set in environment")

        self.client: Client = create_client(
            settings.SUPABASE_URL,
            settings.SUPABASE_KEY
        )

    @staticmethod
    def _now_iso() -> str:
        """Current timestamp as an ISO-8601 string.

        NOTE(review): naive local time, matching what this module has always
        written; switching to timezone-aware UTC would change stored values.
        """
        return datetime.now().isoformat()

    # ==================== Document Requests ====================

    def create_document_request(
        self,
        user_id: int,
        metadata: Dict[str, Any],
        status: str = "pending"
    ) -> str:
        """
        Create a new document generation request.

        Args:
            user_id: User ID from users table
            metadata: Request parameters (seed_images, prompt_params, etc.)
            status: Initial status (default: 'pending')

        Returns:
            request_id (UUID)
        """
        # Take one timestamp so created_at and updated_at are identical on
        # insert (two separate datetime.now() calls would differ by microseconds).
        now = self._now_iso()
        result = self.client.table("document_requests").insert({
            "user_id": user_id,
            "metadata": metadata,
            "status": status,
            "created_at": now,
            "updated_at": now
        }).execute()

        return result.data[0]["id"]

    def update_request_status(
        self,
        request_id: str,
        status: str,
        error_message: Optional[str] = None
    ) -> None:
        """
        Update document request status.

        Args:
            request_id: UUID of the request
            status: New status (pending, processing, generating, completed, failed, etc.)
            error_message: Error message if status is 'failed'
        """
        update_data: Dict[str, Any] = {
            "status": status,
            "updated_at": self._now_iso()
        }

        if error_message:
            update_data["error_message"] = error_message

        self.client.table("document_requests").update(update_data).eq(
            "id", request_id
        ).execute()

    def get_request(self, request_id: str) -> Optional[Dict[str, Any]]:
        """
        Get document request by ID.

        Returns:
            Dict with keys: id, user_id, metadata, status, created_at,
            updated_at, error_message — or None if the request does not exist.
        """
        result = self.client.table("document_requests").select("*").eq(
            "id", request_id
        ).execute()

        return result.data[0] if result.data else None

    def get_user_id_from_request(self, request_id: str) -> Optional[int]:
        """
        Get user_id from a document request.

        Args:
            request_id: Document request UUID

        Returns:
            user_id or None if request not found
        """
        result = self.client.table("document_requests").select("user_id").eq(
            "id", request_id
        ).execute()

        return result.data[0]["user_id"] if result.data else None

    def get_user_requests(
        self,
        user_id: int,
        limit: int = 50,
        offset: int = 0
    ) -> List[Dict[str, Any]]:
        """Get a page of requests for a user, ordered by created_at DESC.

        Args:
            user_id: Owner of the requests.
            limit: Maximum number of rows to return.
            offset: Number of rows to skip (for pagination).
        """
        result = self.client.table("document_requests").select(
            "*"
        ).eq("user_id", user_id).order(
            "created_at", desc=True
        ).range(offset, offset + limit - 1).execute()

        return result.data

    # ==================== Generated Documents ====================

    def create_generated_document(
        self,
        request_id: str,
        file_url: Optional[str] = None,
        file_type: Optional[str] = None,
        model_version: Optional[str] = None,
        doc_index: Optional[int] = None,
        doc_storage_path: Optional[str] = None,
        gt_storage_path: Optional[str] = None,
        html_storage_path: Optional[str] = None,
        bbox_storage_path: Optional[str] = None,
        zip_url: Optional[str] = None,
        flagged: bool = False,
        flag_reason: Optional[str] = None
    ) -> str:
        """
        Create record for a generated document.

        Args:
            request_id: Parent request UUID (FK to document_requests)
            file_url: Google Drive URL or other storage URL
            file_type: MIME type (e.g., 'application/zip', 'application/pdf')
            model_version: Model version used for generation (optional)
            doc_index: Index of the document within the request (optional)
            doc_storage_path: Path to the generated PDF in Supabase storage (optional)
            gt_storage_path: Path to the ground truth JSON in Supabase storage (optional)
            html_storage_path: Path to the HTML source in Supabase storage (optional)
            bbox_storage_path: Path to the bbox JSON in Supabase storage (optional)
            zip_url: URL of the bundled ZIP archive (optional)
            flagged: Whether the document is flagged for review
            flag_reason: Reason for flagging

        Returns:
            id (UUID) - Database record ID
        """
        now = self._now_iso()
        insert_data: Dict[str, Any] = {
            "request_id": request_id,
            "created_at": now,
            "updated_at": now,
            "flagged": flagged
        }

        # Only persist optional columns that were actually provided so database
        # defaults / NULLs are preserved for the rest (replaces a 10-branch
        # `if x is not None` chain).
        optional_columns = {
            "file_url": file_url,
            "file_type": file_type,
            "model_version": model_version,
            "doc_index": doc_index,
            "doc_storage_path": doc_storage_path,
            "gt_storage_path": gt_storage_path,
            "html_storage_path": html_storage_path,
            "bbox_storage_path": bbox_storage_path,
            "zip_url": zip_url,
            "flag_reason": flag_reason,
        }
        insert_data.update(
            {column: value for column, value in optional_columns.items() if value is not None}
        )

        result = self.client.table("generated_documents").insert(insert_data).execute()

        return result.data[0]["id"]

    def upload_to_storage(
        self,
        bucket_name: str,
        path: str,
        file_bytes: bytes,
        content_type: str
    ) -> Dict[str, Any]:
        """
        Upload a file to Supabase storage (upserts on path collision).

        Args:
            bucket_name: The name of the Supabase storage bucket
            path: The path/filename to store the file as
            file_bytes: The raw bytes of the file
            content_type: MIME type of the file

        Returns:
            Upload result dictionary containing the path
        """
        return self.client.storage.from_(bucket_name).upload(
            file=file_bytes,
            path=path,
            file_options={"content-type": content_type, "upsert": "true"}
        )

    def list_files(self, bucket_name: str, path: str) -> List[Dict[str, Any]]:
        """List files in a Supabase storage bucket at a given path."""
        return self.client.storage.from_(bucket_name).list(path)

    def download_file(self, bucket_name: str, path: str) -> bytes:
        """Download a file from Supabase storage."""
        return self.client.storage.from_(bucket_name).download(path)

    def get_public_url(self, bucket_name: str, path: str) -> str:
        """Get the public URL for a file in Supabase storage."""
        return self.client.storage.from_(bucket_name).get_public_url(path)

    def get_generated_documents(
        self,
        request_id: str
    ) -> List[Dict[str, Any]]:
        """Get all generated documents belonging to a request."""
        result = self.client.table("generated_documents").select("*").eq(
            "request_id", request_id
        ).execute()

        return result.data

    # ==================== User Integrations ====================

    def get_user_google_drive_integration(
        self,
        user_id: int
    ) -> Optional[Dict[str, Any]]:
        """Get user's Google Drive integration credentials, or None if absent."""
        result = self.client.table("user_integrations").select("*").eq(
            "user_id", user_id
        ).eq("provider", "google_drive").execute()

        return result.data[0] if result.data else None

    def update_google_drive_tokens(
        self,
        user_id: int,
        access_token: str,
        refresh_token: Optional[str] = None,
        expires_at: Optional[datetime] = None
    ) -> None:
        """[DEPRECATED] Update Google Drive OAuth tokens.

        Intentionally a no-op: the frontend now handles OAuth. Kept only so
        existing callers keep working.
        """
        pass

    # ==================== Analytics ====================

    def log_analytics_event(
        self,
        user_id: int,
        event_type: str,
        entity_id: Optional[str] = None
    ) -> None:
        """Log an analytics event row (fire-and-forget; no return value)."""
        self.client.table("analytics_events").insert({
            "user_id": user_id,
            "event_type": event_type,
            "entity_id": entity_id,
            "created_at": self._now_iso()
        }).execute()
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# Global shared instance, constructed at import time.
# NOTE(review): this raises ValueError on import when SUPABASE_URL /
# SUPABASE_KEY are unset, making the whole module unimportable in
# unconfigured environments — consider lazy initialization.
supabase_client = SupabaseClient()
|
api/test_api.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for DocGenie API.
|
| 4 |
+
Verifies all components are properly installed and configured.
|
| 5 |
+
"""
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_imports():
    """Test that all required third-party modules can be imported.

    Prints one status line per dependency and stops at the first failure so
    the user immediately sees which package is missing.

    Returns:
        bool: True if every dependency imported successfully.
    """
    import importlib

    print("Testing imports...")

    # (module to import, attribute to resolve or None, display label),
    # checked in the same order as before.
    checks = [
        ("fastapi", None, "FastAPI"),
        ("uvicorn", None, "Uvicorn"),
        ("pydantic", None, "Pydantic"),
        ("requests", None, "Requests"),
        ("PIL", "Image", "Pillow"),
        ("bs4", "BeautifulSoup", "BeautifulSoup4"),
        ("playwright.async_api", "async_playwright", "Playwright"),
        ("anthropic", None, "Anthropic"),
        ("docgenie", "ENV", "DocGenie"),
    ]

    for module_name, attr, label in checks:
        try:
            module = importlib.import_module(module_name)
            if attr is not None:
                # Mirror `from X import Y`: resolving the attribute must work too.
                getattr(module, attr)
            print(f"  ✓ {label}")
        except (ImportError, AttributeError) as e:
            print(f"  ✗ {label}: {e}")
            return False

    return True
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_api_structure():
    """Test that the expected API files exist next to this script.

    Returns:
        bool: True if every expected file is present.
    """
    print("\nTesting API structure...")

    api_dir = Path(__file__).parent

    files = {
        "main.py": "Main API application",
        "schemas.py": "Request/Response models",
        "utils.py": "Processing utilities",
        "README.md": "Documentation",
        "__init__.py": "Package init"
    }

    all_present = True
    for filename, description in files.items():
        filepath = api_dir / filename
        # FIX: previously these lines printed a placeholder instead of the
        # file name, so the output never identified which file was checked.
        if filepath.exists():
            print(f"  ✓ {filename}: {description}")
        else:
            print(f"  ✗ {filename}: Missing!")
            all_present = False

    return all_present
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def test_docgenie_integration():
    """Test integration with DocGenie modules."""
    print("\nTesting DocGenie integration...")

    try:
        from docgenie import ENV

        template_path = ENV.PROMPT_TEMPLATES_DIR / "ClaudeRefined12" / "seed-based-json.txt"

        # Guard clause: bail out early when the template is absent.
        if not template_path.exists():
            print(f" ✗ Prompt template not found: {template_path}")
            return False
        print(f" ✓ Prompt template found: {template_path}")

        # Test reading template
        text = template_path.read_text(encoding='utf-8')
        if "{language}" not in text or "{doc_type}" not in text:
            print(" ✗ Prompt template missing placeholders")
            return False
        print(" ✓ Prompt template has required placeholders")

        return True

    except Exception as e:
        print(f" ✗ Error: {e}")
        return False
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def test_environment():
|
| 137 |
+
"""Test environment configuration."""
|
| 138 |
+
print("\nTesting environment...")
|
| 139 |
+
|
| 140 |
+
api_key = os.getenv("ANTHROPIC_API_KEY")
|
| 141 |
+
if api_key:
|
| 142 |
+
print(f" ✓ ANTHROPIC_API_KEY is set (length: {len(api_key)})")
|
| 143 |
+
else:
|
| 144 |
+
print(" ⚠ ANTHROPIC_API_KEY not set (optional for testing)")
|
| 145 |
+
|
| 146 |
+
python_version = sys.version_info
|
| 147 |
+
if python_version >= (3, 10):
|
| 148 |
+
print(f" ✓ Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")
|
| 149 |
+
else:
|
| 150 |
+
print(f" ✗ Python version: {python_version.major}.{python_version.minor}.{python_version.micro} (3.10+ required)")
|
| 151 |
+
return False
|
| 152 |
+
|
| 153 |
+
return True
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def test_playwright_browsers():
    """Test if the Playwright CLI is available on PATH."""
    print("\nTesting Playwright browsers...")

    try:
        import subprocess

        proc = subprocess.run(
            ["playwright", "show-trace", "--help"],
            capture_output=True,
            timeout=5,
        )

        message = (
            " ✓ Playwright CLI is available"
            if proc.returncode == 0
            else " ⚠ Playwright CLI might have issues"
        )
        print(message)

        # Check if chromium is installed
        # This is a basic check - actual browser installation is verified at runtime
        print(" ℹ Chromium will be verified when rendering PDFs")

        return True

    except FileNotFoundError:
        print(" ✗ Playwright CLI not found")
        return False
    except Exception as e:
        print(f" ⚠ Could not verify Playwright: {e}")
        return True  # Non-critical for this test
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def test_api_modules():
    """Test that the local API modules import and expose the schema models."""
    print("\nTesting API modules...")

    try:
        # Make both the project root and the api directory importable.
        here = Path(__file__).parent
        sys.path.insert(0, str(here.parent))
        sys.path.insert(0, str(here))

        import schemas
        print(" ✓ schemas module")

        import utils
        print(" ✓ utils module")

        # Touch each model so a missing definition fails loudly.
        schemas.GenerateDocumentRequest
        schemas.GenerateDocumentResponse
        schemas.DocumentResult
        print(" ✓ All schema models defined")

        return True

    except Exception as e:
        print(f" ✗ Error importing API modules: {e}")
        return False
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def main():
    """Run all tests, print a summary, and return a process exit code."""
    banner = "=" * 60
    print(banner)
    print("DocGenie API - Test Suite")
    print(banner)

    results = {
        "Imports": test_imports(),
        "API Structure": test_api_structure(),
        "Environment": test_environment(),
        "DocGenie Integration": test_docgenie_integration(),
        "Playwright": test_playwright_browsers(),
        "API Modules": test_api_modules()
    }

    print("\n" + banner)
    print("Test Results Summary")
    print(banner)

    for test_name, passed in results.items():
        print(f"{'✓ PASS' if passed else '✗ FAIL'}: {test_name}")

    all_passed = all(results.values())

    print("\n" + banner)
    if all_passed:
        print("✅ All tests passed! API is ready to use.")
        print("\nTo start the API:")
        print("  cd api")
        print("  python main.py")
        print("\nThen visit: http://localhost:8000/docs")
    else:
        print("⚠️  Some tests failed. Please fix issues before running the API.")
        print("\nCommon fixes:")
        print("  uv sync  # or: pip install -e .")
        print("  playwright install chromium")
        print("  export ANTHROPIC_API_KEY='your-key'")
    print(banner)

    return 0 if all_passed else 1
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
if __name__ == "__main__":
|
| 261 |
+
sys.exit(main())
|
api/test_async_api.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for async document generation API with Google Drive upload.
|
| 4 |
+
|
| 5 |
+
Tests the complete async workflow with all features enabled:
|
| 6 |
+
- Handwriting insertion
|
| 7 |
+
- Visual elements (stamps, logos, figures, barcodes, photos)
|
| 8 |
+
- OCR processing
|
| 9 |
+
- Ground truth verification
|
| 10 |
+
- Analysis and debug visualization
|
| 11 |
+
- Dataset export
|
| 12 |
+
- Google Drive upload
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
python test_async_api.py
|
| 16 |
+
|
| 17 |
+
The script uses hardcoded tokens and polls continuously for status updates.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import requests
|
| 21 |
+
import time
|
| 22 |
+
import sys
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Configuration
import os

BASE_URL = "http://localhost:8000"
POLL_INTERVAL = 10  # seconds between status checks

# SECURITY FIX: real Google OAuth access/refresh tokens were hardcoded here.
# Committed credentials must be treated as compromised and revoked. The tokens
# are now supplied via environment variables; the payload falls back to empty
# strings when they are unset.
GOOGLE_DRIVE_TOKEN = os.environ.get("GOOGLE_DRIVE_TOKEN", "")
GOOGLE_DRIVE_REFRESH_TOKEN = os.environ.get("GOOGLE_DRIVE_REFRESH_TOKEN", "")

# Test payload with all features enabled
PAYLOAD = {
    "user_id": 123,
    "google_drive_token": GOOGLE_DRIVE_TOKEN,
    "google_drive_refresh_token": GOOGLE_DRIVE_REFRESH_TOKEN,
    "seed_images": [
        "https://ocr.space/Content/Images/receipt-ocr-original.webp"
    ],
    "prompt_params": {
        "language": "English",
        "doc_type": "business and administrative",
        "gt_type": "Multiple questions about each document, with their answers taken **verbatim** from the document.",
        "gt_format": "{\"<Text of question 1>\": \"<Answer to question 1>\", \"<Text of question 2>\": \"<Answer to question 2>\", ...}",
        "num_solutions": 1,
        "enable_handwriting": True,
        "handwriting_ratio": 0.3,
        "enable_visual_elements": True,
        "visual_element_types": [
            "stamp",
            "logo",
            "figure",
            "barcode",
            "photo"
        ],
        "seed": None,  # Use None for random behavior, or set to integer for reproducibility
        "enable_ocr": True,
        "ocr_language": "en",
        "enable_bbox_normalization": True,
        "enable_gt_verification": True,
        "enable_analysis": True,
        "enable_debug_visualization": True,
        "enable_dataset_export": True,
        "dataset_export_format": "msgpack",
        "output_detail": "dataset"
    }
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def test_health():
    """Test API health endpoint."""
    divider = "=" * 80
    print(divider)
    print("TESTING API HEALTH")
    print(divider)

    try:
        resp = requests.get(f"{BASE_URL}/health", timeout=5)
        resp.raise_for_status()
        print(f"✓ API is healthy: {resp.json()}\n")
        return True
    except Exception as exc:
        print(f"✗ Health check failed: {exc}\n")
        return False
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def submit_async_job():
    """Submit the async document-generation job to the API.

    POSTs the module-level PAYLOAD to /generate/async after printing a
    summary of the requested features.

    Returns:
        The server-assigned request_id on success, or None if the
        submission failed (HTTP error or any unexpected exception).
    """
    print("=" * 80)
    print("SUBMITTING ASYNC JOB")
    print("=" * 80)

    print("\nConfiguration:")
    print(f" User ID: {PAYLOAD['user_id']}")
    print(f" Seed Images: {len(PAYLOAD['seed_images'])}")
    print(f" Num Solutions: {PAYLOAD['prompt_params']['num_solutions']}")
    print(f" Handwriting: {PAYLOAD['prompt_params']['enable_handwriting']} (ratio: {PAYLOAD['prompt_params']['handwriting_ratio']})")
    print(f" Visual Elements: {PAYLOAD['prompt_params']['enable_visual_elements']} (types: {len(PAYLOAD['prompt_params']['visual_element_types'])})")
    print(f" OCR: {PAYLOAD['prompt_params']['enable_ocr']}")
    print(f" GT Verification: {PAYLOAD['prompt_params']['enable_gt_verification']}")
    print(f" Analysis: {PAYLOAD['prompt_params']['enable_analysis']}")
    print(f" Debug Viz: {PAYLOAD['prompt_params']['enable_debug_visualization']}")
    print(f" Dataset Export: {PAYLOAD['prompt_params']['enable_dataset_export']}")
    print(" Google Drive Upload: Yes")
    print()

    try:
        print("⏳ Submitting job to /generate/async...")
        response = requests.post(
            f"{BASE_URL}/generate/async",
            json=PAYLOAD,
            timeout=30
        )
        response.raise_for_status()
        result = response.json()

        request_id = result["request_id"]

        print("\n✓ Job submitted successfully!")
        print(f" Request ID: {request_id}")
        print(f" Status: {result['status']}")
        print(f" Estimated Time: {result.get('estimated_time_minutes', 'N/A')} minutes")
        print(f" Poll URL: {result.get('poll_url', 'N/A')}")

        return request_id

    except requests.exceptions.HTTPError as e:
        print(f"\n✗ Job submission failed: {e}")
        # BUG FIX: requests.Response is falsy for 4xx/5xx status codes, so
        # the original `if e.response:` skipped printing the error body for
        # exactly the responses we want to see. Compare against None instead.
        if e.response is not None:
            print(f" Response: {e.response.text}")
        return None
    except Exception as e:
        print(f"\n✗ Unexpected error: {e}")
        return None
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def poll_job_status(request_id):
    """Poll job status continuously until completion or failure"""
    # Polls GET /jobs/{request_id}/status every POLL_INTERVAL seconds,
    # printing a line only when the status or progress text changes.
    # Returns the final status payload (a dict) on "completed"/"failed",
    # or {"status": "interrupted"} if the user presses Ctrl+C.
    print("\n" + "=" * 80)
    print("CONTINUOUS STATUS POLLING")
    print("=" * 80)
    print(f"Request ID: {request_id}")
    print(f"Polling every {POLL_INTERVAL} seconds...")
    print("Press Ctrl+C to stop polling\n")

    # Track the previously seen status/progress so unchanged polls stay quiet.
    poll_count = 0
    last_status = None
    last_progress = None

    # Infinite loop by design: only a terminal status, Ctrl+C, or an
    # unrecoverable condition outside this function stops it.
    while True:
        poll_count += 1
        timestamp = time.strftime("%H:%M:%S")

        try:
            response = requests.get(
                f"{BASE_URL}/jobs/{request_id}/status",
                timeout=10
            )
            response.raise_for_status()
            status_data = response.json()

            current_status = status_data["status"]
            current_progress = status_data.get("progress")

            # Only print if status or progress changed
            if current_status != last_status or current_progress != last_progress:
                print(f"[{timestamp}] Poll #{poll_count}: {current_status.upper()}", end="")
                if current_progress:
                    print(f" - {current_progress}", end="")
                print()

                last_status = current_status
                last_progress = current_progress

            # Check terminal states
            if current_status == "completed":
                print("\n" + "=" * 80)
                print("✓ JOB COMPLETED!")
                print("=" * 80)

                # The completed payload carries the upload/export metadata.
                results = status_data.get('results', {})
                download_url = results.get('download_url')

                if download_url:
                    print(f" ✓ Google Drive URL: {download_url}")
                else:
                    print(f" ⚠ Google Drive URL not available")

                if results.get('file_size_mb'):
                    print(f" File Size: {results['file_size_mb']:.2f} MB")

                print(f" Document Count: {results.get('document_count', 'N/A')}")
                print(f" Created: {status_data.get('created_at')}")
                print(f" Completed: {status_data.get('updated_at')}")

                return status_data

            elif current_status == "failed":
                print("\n" + "=" * 80)
                print("✗ JOB FAILED!")
                print("=" * 80)
                print(f" Error: {status_data.get('error_message', 'Unknown error')}")
                print(f" Created: {status_data.get('created_at')}")
                print(f" Failed: {status_data.get('updated_at')}")
                return status_data

            # Wait before next poll
            time.sleep(POLL_INTERVAL)

        except KeyboardInterrupt:
            # Ctrl+C leaves the job running server-side; tell the user how
            # to resume checking manually.
            print("\n\n⚠ Polling interrupted by user")
            print(f"You can continue polling manually:")
            print(f" GET {BASE_URL}/jobs/{request_id}/status")
            return {"status": "interrupted"}

        except Exception as e:
            # Transient errors (network blips, 5xx) are reported and the
            # loop retries forever after the usual delay.
            print(f"\n⚠ Error polling status: {e}")
            time.sleep(POLL_INTERVAL)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def list_user_jobs():
    """List all jobs for the test user"""
    header = "=" * 80
    print("\n" + header)
    print("LISTING USER JOBS")
    print(header)

    user_id = PAYLOAD['user_id']

    try:
        resp = requests.get(
            f"{BASE_URL}/jobs/user/{user_id}",
            params={"limit": 10, "offset": 0},
            timeout=10,
        )
        resp.raise_for_status()
        jobs = resp.json().get("jobs", [])

        print(f"\n✓ Found {len(jobs)} jobs for user {user_id}:\n")

        for idx, job in enumerate(jobs, 1):
            print(f"{idx}. Request {job['request_id'][:8]}...")
            print(f" Status: {job['status']}")
            print(f" Created: {job.get('created_at', 'N/A')}")
            if job.get('download_url'):
                print(f" Download: {job['download_url']}")
            print()

        return jobs

    except Exception as exc:
        print(f"\n✗ Error listing jobs: {exc}")
        return []
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def main():
    """Run the end-to-end async API test and exit with a status code."""
    # Flow: health check -> submit job -> poll until terminal state ->
    # list the user's jobs -> print a summary. Exit code 0 on success or
    # user interrupt, 1 on any failure or still-running state.
    print("\n" + "=" * 80)
    print(" " * 15 + "ASYNC PDF API TEST - FULL FEATURE SET")
    print("=" * 80)
    print(f"Base URL: {BASE_URL}")
    print(f"User ID: {PAYLOAD['user_id']}")
    print("=" * 80)
    print()

    # Step 1: Health check
    if not test_health():
        print("\n❌ API is not accessible. Make sure the server is running.")
        print(f" Expected URL: {BASE_URL}")
        sys.exit(1)

    # Step 2: Submit job
    request_id = submit_async_job()

    if not request_id:
        print("\n❌ Failed to submit job. Test aborted.")
        sys.exit(1)

    # Step 3: Poll status continuously
    # poll_job_status blocks until "completed"/"failed" or Ctrl+C.
    final_status = poll_job_status(request_id)

    # Step 4: List all user jobs
    list_user_jobs()

    # Final summary
    print("\n" + "=" * 80)
    print(" " * 30 + "SUMMARY")
    print("=" * 80)

    status = final_status.get("status")

    if status == "completed":
        print("✅ ALL TESTS PASSED!")
        print("\nFeatures tested:")
        print(" ✓ Async job submission")
        print(" ✓ Handwriting insertion")
        print(" ✓ Visual elements (5 types)")
        print(" ✓ OCR processing")
        print(" ✓ Ground truth verification")
        print(" ✓ Analysis & debug visualization")
        print(" ✓ Dataset export")
        print(" ✓ Google Drive upload")
        print(" ✓ Continuous status polling")
        print(f"\n✓ Your documents are available at:")
        print(f" {final_status.get('results', {}).get('download_url')}")
        sys.exit(0)

    elif status == "failed":
        print("❌ JOB FAILED")
        print(f"Error: {final_status.get('error_message')}")
        sys.exit(1)

    elif status == "interrupted":
        # User stopped polling; the job may still finish server-side, so
        # this is not treated as a test failure.
        print("⏸ POLLING INTERRUPTED")
        print(f"Job is still running. Check status manually:")
        print(f" GET {BASE_URL}/jobs/{request_id}/status")
        sys.exit(0)

    else:
        # Any other status means the job never reached a terminal state.
        print("⏱ JOB STILL IN PROGRESS")
        print(f"Check status manually: GET {BASE_URL}/jobs/{request_id}/status")
        sys.exit(1)
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
# Script entry point: run the full async-API test flow.
if __name__ == "__main__":
    main()
|
api/test_get_google_token.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Helper script to get Google Drive OAuth token for testing.
|
| 3 |
+
|
| 4 |
+
This script implements the OAuth flow to get access and refresh tokens
|
| 5 |
+
from Google Drive API for testing purposes.
|
| 6 |
+
|
| 7 |
+
Prerequisites:
|
| 8 |
+
1. Google Cloud Project with Drive API enabled
|
| 9 |
+
2. OAuth 2.0 Client ID credentials
|
| 10 |
+
3. Add http://localhost:8080 as authorized redirect URI
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python test_get_google_token.py --client-id YOUR_CLIENT_ID --client-secret YOUR_CLIENT_SECRET
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import webbrowser
|
| 18 |
+
from urllib.parse import urlencode, parse_qs
|
| 19 |
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 20 |
+
import requests
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Global variable to store authorization code
|
| 24 |
+
auth_code = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class OAuthCallbackHandler(BaseHTTPRequestHandler):
    """HTTP server to handle OAuth callback"""

    def do_GET(self):
        # Receives Google's OAuth redirect. On success the query string
        # carries ?code=...; store it in the module-level `auth_code` so the
        # waiting loop in get_google_drive_token() can pick it up.
        global auth_code

        # Parse query parameters
        # NOTE: if there is no '?', split()[-1] yields the whole path and
        # parse_qs returns {}, which falls through to the error branch.
        query = self.path.split('?', 1)[-1]
        params = parse_qs(query)

        if 'code' in params:
            auth_code = params['code'][0]

            # Send success response
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()

            html = """
            <html>
            <head><title>Authorization Successful</title></head>
            <body style="font-family: Arial; text-align: center; padding: 50px;">
            <h1 style="color: green;">✓ Authorization Successful!</h1>
            <p>You can close this window and return to the terminal.</p>
            </body>
            </html>
            """
            self.wfile.write(html.encode())
        else:
            # Error response
            self.send_response(400)
            self.send_header('Content-type', 'text/html')
            self.end_headers()

            error = params.get('error', ['Unknown error'])[0]
            html = f"""
            <html>
            <head><title>Authorization Failed</title></head>
            <body style="font-family: Arial; text-align: center; padding: 50px;">
            <h1 style="color: red;">✗ Authorization Failed</h1>
            <p>Error: {error}</p>
            <p>Please try again.</p>
            </body>
            </html>
            """
            self.wfile.write(html.encode())

    def log_message(self, format, *args):
        """Suppress default logging"""
        # Silences BaseHTTPRequestHandler's per-request stderr noise.
        pass
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_google_drive_token(client_id: str, client_secret: str, redirect_uri: str = "http://localhost:8080"):
    """
    Get Google Drive OAuth tokens through OAuth flow.

    Opens the Google consent page in a browser, runs a one-shot local HTTP
    server to capture the redirect, then exchanges the authorization code
    for tokens at Google's token endpoint.

    Args:
        client_id: Google OAuth client ID
        client_secret: Google OAuth client secret
        redirect_uri: OAuth redirect URI (must match Google Cloud Console)

    Returns:
        dict with 'access_token' and (usually) 'refresh_token', or None on
        failure.
    """
    global auth_code

    print("=" * 80)
    print(" " * 20 + "GOOGLE DRIVE OAUTH TOKEN GENERATOR")
    print("=" * 80)
    print()

    # Step 1: Generate authorization URL
    auth_params = {
        'client_id': client_id,
        'redirect_uri': redirect_uri,
        'response_type': 'code',
        'scope': 'https://www.googleapis.com/auth/drive.file',
        'access_type': 'offline',  # Get refresh token
        'prompt': 'consent'  # Force consent to get refresh token
    }

    auth_url = f"https://accounts.google.com/o/oauth2/v2/auth?{urlencode(auth_params)}"

    print("Step 1: Authorize with Google")
    print("-" * 80)
    print("\nOpening authorization URL in your browser...")
    print("If it doesn't open automatically, copy this URL:\n")
    print(auth_url)
    print()

    # Open browser
    webbrowser.open(auth_url)

    # Step 2: Start local server to receive callback
    print("Step 2: Waiting for authorization...")
    print("-" * 80)
    print(f"Local server listening on {redirect_uri}")
    print("Complete the authorization in your browser.")
    print()

    # BUG FIX: the original always bound port 8080 even when the caller
    # passed a different --redirect-uri. Derive the port from the URI so the
    # parameter is actually honored (default is still 8080).
    from urllib.parse import urlsplit
    callback_port = urlsplit(redirect_uri).port or 80
    server = HTTPServer(('localhost', callback_port), OAuthCallbackHandler)

    # Handle requests one at a time until the callback stores a code.
    while auth_code is None:
        server.handle_request()

    server.server_close()

    if not auth_code:
        print("✗ Failed to get authorization code")
        return None

    print("✓ Authorization code received!")
    print()

    # Step 3: Exchange code for tokens
    print("Step 3: Exchanging code for tokens...")
    print("-" * 80)

    token_url = "https://oauth2.googleapis.com/token"
    token_data = {
        'code': auth_code,
        'client_id': client_id,
        'client_secret': client_secret,
        'redirect_uri': redirect_uri,
        'grant_type': 'authorization_code'
    }

    try:
        response = requests.post(token_url, data=token_data)
        response.raise_for_status()
        tokens = response.json()

        print("✓ Tokens received!")
        print()
        print("=" * 80)
        print(" " * 30 + "TOKENS")
        print("=" * 80)
        print()
        print("Access Token:")
        print(tokens['access_token'])
        print()

        if 'refresh_token' in tokens:
            print("Refresh Token:")
            print(tokens['refresh_token'])
            print()
        else:
            # Google only issues a refresh token on first consent unless
            # access is revoked and re-granted.
            print("⚠ No refresh token received (user may have authorized before)")
            print(" To get a refresh token:")
            print(" 1. Go to: https://myaccount.google.com/permissions")
            print(" 2. Remove your app's access")
            print(" 3. Run this script again")
            print()

        print("Expires In: {} seconds".format(tokens.get('expires_in', 'N/A')))
        print()

        # Show usage instructions
        print("=" * 80)
        print(" " * 25 + "USAGE INSTRUCTIONS")
        print("=" * 80)
        print()
        print("Option 1: Use with test script directly")
        print("-" * 80)
        print("python test_async_api.py \\")
        print(f" --google-token {tokens['access_token']}")
        if 'refresh_token' in tokens:
            print(f" --google-refresh-token {tokens['refresh_token']}")
        print()

        print("Option 2: Set environment variable")
        print("-" * 80)
        print(f"export GOOGLE_DRIVE_TOKEN=\"{tokens['access_token']}\"")
        if 'refresh_token' in tokens:
            print(f"export GOOGLE_DRIVE_REFRESH_TOKEN=\"{tokens['refresh_token']}\"")
        print("python test_async_api.py")
        print()

        print("Option 3: Use in your frontend")
        print("-" * 80)
        print("Store these tokens in your frontend application and include them")
        print("in API requests to /generate/async endpoint.")
        print()

        print("=" * 80)

        return tokens

    except Exception as e:
        print(f"✗ Failed to exchange code for tokens: {e}")
        # BUG FIX: requests.Response is falsy for 4xx/5xx, so the original
        # `hasattr(e, 'response') and e.response` suppressed the error body
        # for exactly the failing responses. Compare against None instead.
        if getattr(e, 'response', None) is not None:
            print(f"Response: {e.response.text}")
        return None
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def main():
    """Parse CLI credentials and run the interactive OAuth token flow."""
    arg_parser = argparse.ArgumentParser(
        description="Get Google Drive OAuth token for testing"
    )
    arg_parser.add_argument(
        "--client-id",
        type=str,
        required=True,
        help="Google OAuth Client ID"
    )
    arg_parser.add_argument(
        "--client-secret",
        type=str,
        required=True,
        help="Google OAuth Client Secret"
    )
    arg_parser.add_argument(
        "--redirect-uri",
        type=str,
        default="http://localhost:8080",
        help="OAuth redirect URI (default: http://localhost:8080)"
    )
    opts = arg_parser.parse_args()

    divider = "-" * 80
    print()
    print("Prerequisites Check:")
    print(divider)
    print(f"✓ Client ID: {opts.client_id[:20]}...")
    print(f"✓ Client Secret: {opts.client_secret[:10]}...")
    print(f"✓ Redirect URI: {opts.redirect_uri}")
    print()
    print("Make sure you've added this redirect URI to your Google Cloud Console:")
    print(" https://console.cloud.google.com/apis/credentials")
    print()
    input("Press Enter to continue...")
    print()

    tokens = get_google_drive_token(
        client_id=opts.client_id,
        client_secret=opts.client_secret,
        redirect_uri=opts.redirect_uri
    )

    if tokens:
        print("✓ SUCCESS! Use the tokens above to test the async API.")
    else:
        print("✗ FAILED to get tokens. Please check your credentials and try again.")
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# Script entry point: run the interactive OAuth token generator.
if __name__ == "__main__":
    main()
|
api/test_runpod_integration.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test script to verify RunPod handwriting service integration.
|
| 3 |
+
This script tests the integration between the API and the deployed RunPod service.
|
| 4 |
+
"""
|
| 5 |
+
import asyncio
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 11 |
+
|
| 12 |
+
from .utils import call_handwriting_service_batch
|
| 13 |
+
from .config import settings
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
async def test_runpod_integration():
    """Test the RunPod handwriting service integration.

    Sends a small batch of texts to the handwriting service in one request
    and reports timing plus a rough cost comparison against the old
    one-request-per-text approach. Prints diagnostics; returns None.
    """

    print("=" * 80)
    print("RunPod Handwriting Service Integration Test")
    print("=" * 80)

    # Check configuration
    print("\n1. Configuration:")
    print(f" - HANDWRITING_SERVICE_URL: {settings.HANDWRITING_SERVICE_URL}")
    print(f" - HANDWRITING_SERVICE_ENABLED: {settings.HANDWRITING_SERVICE_ENABLED}")
    print(f" - HANDWRITING_SERVICE_TIMEOUT: {settings.HANDWRITING_SERVICE_TIMEOUT}s")
    print(f" - HANDWRITING_SERVICE_MAX_RETRIES: {settings.HANDWRITING_SERVICE_MAX_RETRIES}")
    print(f" - RUNPOD_API_KEY: {'Set' if settings.RUNPOD_API_KEY else 'Not set (optional)'}")
    print(f" - HANDWRITING_APPLY_BLUR: {settings.HANDWRITING_APPLY_BLUR}")

    if not settings.HANDWRITING_SERVICE_ENABLED:
        print("\n❌ HANDWRITING_SERVICE_ENABLED is false. Please enable it in .env")
        return

    # Prepare test data
    test_texts = [
        {"text": "Hello", "author_id": 42, "hw_id": "test_hw_0"},
        {"text": "World", "author_id": 42, "hw_id": "test_hw_1"},
        {"text": "DocGenie", "author_id": 100, "hw_id": "test_hw_2"},
        {"text": "Batch", "author_id": 150, "hw_id": "test_hw_3"},
        {"text": "Processing", "author_id": 200, "hw_id": "test_hw_4"},
    ]

    print(f"\n2. Testing TRUE BATCH PROCESSING (cost-efficient):")
    print(f" - {len(test_texts)} texts will be sent in ONE request")
    print(f" - Activates ONLY 1 RunPod worker (instead of {len(test_texts)} workers)")
    print(f" - Expected cost savings: ~45% compared to parallel processing")
    for text in test_texts:
        print(f" - '{text['text']}' (author_id: {text['author_id']})")

    # Call the service
    print("\n3. Calling RunPod service with BATCH request...")
    import time
    start_time = time.time()

    try:
        results = await call_handwriting_service_batch(test_texts)

        elapsed = time.time() - start_time

        print(f"\n4. Results:")
        print(f" - Successfully generated: {len(results)}/{len(test_texts)}")
        # BUG FIX: the per-text average was computed unconditionally, so an
        # empty result list crashed with ZeroDivisionError before the
        # "No results" branch below could run.
        if results:
            print(f" - Total time: {elapsed:.1f}s ({elapsed/len(results):.1f}s per text)")
        else:
            print(f" - Total time: {elapsed:.1f}s")
        print(f" - Worker activations: 1 (would be {len(test_texts)} with old parallel method)")

        if results:
            print("\n5. Sample result details:")
            for i, result in enumerate(results[:2]):  # Show first 2 results
                print(f"\n Result {i+1}:")
                print(f" - hw_id: {result.get('hw_id')}")
                print(f" - text: {result.get('text')}")
                print(f" - author_id: {result.get('author_id')}")
                print(f" - width: {result.get('width')}px")
                print(f" - height: {result.get('height')}px")
                # BUG FIX: .get() returns None for a missing key; avoid
                # slicing None by defaulting to an empty string.
                img_b64 = result.get('image_base64') or ''
                print(f" - image_base64: {img_b64[:50]}... ({len(img_b64)} chars)")

            print("\n" + "=" * 80)
            print("✅ BATCH PROCESSING TEST PASSED!")
            print("=" * 80)
            print("\nCost Analysis:")
            print(f" OLD (parallel): {len(test_texts)} workers × 18s = {len(test_texts) * 18}s total worker time")
            print(f" NEW (batched): 1 worker × {int(elapsed)}s = {int(elapsed)}s total worker time")
            print(f" Savings: ~{int((1 - elapsed / (len(test_texts) * 18)) * 100)}% reduction in worker activation costs")
            # BUG FIX: this string was missing its f-prefix, so the literal
            # "{len(test_texts)}" was printed instead of the count.
            print(f"\nThe API now sends all {len(test_texts)} texts in ONE request, activating only 1 worker.")
            print("This significantly reduces RunPod costs while maintaining quality.")
        else:
            print("\n⚠️ No results returned. Check the error messages above.")

    except Exception as e:
        print(f"\n❌ Integration test FAILED!")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        print("\nPossible issues:")
        print("1. Check that HANDWRITING_SERVICE_URL in .env is correct")
        print("2. Verify the RunPod endpoint is deployed with v12 (batch support)")
        print("3. Check if RUNPOD_API_KEY is required and set correctly")
        print("4. Ensure the service handler supports batch input format")
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Script entry point: run the async integration test on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(test_runpod_integration())
|
api/test_sync_pdf_api.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script for /generate/pdf endpoint (sync API with tracking & GDrive upload).

Tests the complete flow with all features enabled:
- Handwriting insertion
- Visual elements (stamps, logos, figures, barcodes, photos)
- OCR processing
- Ground truth verification
- Analysis and debug visualization
- Dataset export
- Google Drive upload

Usage:
    export GOOGLE_DRIVE_TOKEN="..."
    export GOOGLE_DRIVE_REFRESH_TOKEN="..."
    python test_sync_pdf_api.py

The script reads Google Drive OAuth tokens from the environment and polls
continuously for status updates.
"""

import io
import os
import sys
import time
import zipfile

import requests


# Configuration
BASE_URL = "http://localhost:8000"
POLL_INTERVAL = 10  # seconds between status checks

# Test payload with all features enabled.
# SECURITY FIX: the OAuth access/refresh tokens were previously hardcoded
# here. Credentials must never be committed to source control (any token
# that was committed should be revoked); read them from the environment.
PAYLOAD = {
    "user_id": 123,
    "google_drive_token": os.environ.get("GOOGLE_DRIVE_TOKEN", ""),
    "google_drive_refresh_token": os.environ.get("GOOGLE_DRIVE_REFRESH_TOKEN", ""),
    "seed_images": [
        "https://ocr.space/Content/Images/receipt-ocr-original.webp"
    ],
    "prompt_params": {
        "language": "English",
        "doc_type": "business and administrative",
        "gt_type": "Multiple questions about each document, with their answers taken **verbatim** from the document.",
        "gt_format": "{\"<Text of question 1>\": \"<Answer to question 1>\", \"<Text of question 2>\": \"<Answer to question 2>\", ...}",
        "num_solutions": 1,
        "enable_handwriting": True,
        "handwriting_ratio": 0.3,
        "enable_visual_elements": True,
        "visual_element_types": [
            "stamp",
            "logo",
            "figure",
            "barcode",
            "photo"
        ],
        "seed": None,  # Use None for random behavior, or set to integer for reproducibility
        "enable_ocr": True,
        "ocr_language": "en",
        "enable_bbox_normalization": True,
        "enable_gt_verification": True,
        "enable_analysis": True,
        "enable_debug_visualization": True,
        "enable_dataset_export": True,
        "dataset_export_format": "msgpack",
        "output_detail": "dataset"
    }
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_health():
    """Test API health endpoint"""
    banner = "=" * 80
    print(banner)
    print("TESTING API HEALTH")
    print(banner)

    try:
        resp = requests.get(f"{BASE_URL}/health", timeout=5)
        resp.raise_for_status()
    except Exception as exc:
        print(f"✗ Health check failed: {exc}\n")
        return False
    print(f"✓ API is healthy: {resp.json()}\n")
    return True
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def test_sync_endpoint() -> bool:
    """Test sync /generate/pdf endpoint with continuous polling.

    Flow:
      1. POST the module-level PAYLOAD to /generate/pdf and wait for the
         synchronous ZIP response (up to 180s).
      2. Validate the returned ZIP structure and report its contents.
      3. If the response carries an X-Request-ID header, poll
         /jobs/{id}/status every POLL_INTERVAL seconds until the job
         reaches a terminal state ("completed" or "failed").

    Returns:
        True when the ZIP was received and (if polled) the job completed;
        False on timeout, invalid ZIP, job failure, or any other error.
    """
    print("=" * 80)
    print("TESTING SYNC /generate/pdf ENDPOINT")
    print("=" * 80)
    print("\nConfiguration:")
    print(f"  User ID: {PAYLOAD['user_id']}")
    print(f"  Seed Images: {len(PAYLOAD['seed_images'])}")
    print(f"  Num Solutions: {PAYLOAD['prompt_params']['num_solutions']}")
    print(f"  Handwriting: {PAYLOAD['prompt_params']['enable_handwriting']} (ratio: {PAYLOAD['prompt_params']['handwriting_ratio']})")
    print(f"  Visual Elements: {PAYLOAD['prompt_params']['enable_visual_elements']} (types: {len(PAYLOAD['prompt_params']['visual_element_types'])})")
    print(f"  OCR: {PAYLOAD['prompt_params']['enable_ocr']}")
    print(f"  GT Verification: {PAYLOAD['prompt_params']['enable_gt_verification']}")
    print(f"  Analysis: {PAYLOAD['prompt_params']['enable_analysis']}")
    print(f"  Debug Viz: {PAYLOAD['prompt_params']['enable_debug_visualization']}")
    print(f"  Dataset Export: {PAYLOAD['prompt_params']['enable_dataset_export']}")
    print(f"  Google Drive Upload: Yes")
    print()

    try:
        print("⏳ Calling /generate/pdf...")
        print("   (This will return immediately, then we'll poll for status)\n")
        start_time = time.time()

        response = requests.post(
            f"{BASE_URL}/generate/pdf",
            json=PAYLOAD,
            timeout=180,  # 3 minutes max for initial response
            stream=True
        )
        response.raise_for_status()

        elapsed_time = time.time() - start_time

        # Check response headers
        print(f"✓ Response received in {elapsed_time:.1f} seconds")
        print("\nResponse Headers:")

        # X-Request-ID enables the polling phase below; X-Status-URL is
        # informational only (we build the status URL from BASE_URL).
        request_id = response.headers.get('X-Request-ID')
        status_url = response.headers.get('X-Status-URL')

        if request_id:
            print(f"  ✓ X-Request-ID: {request_id}")
        else:
            print(f"  ⚠ X-Request-ID: NOT SET")

        if status_url:
            print(f"  ✓ X-Status-URL: {status_url}")
        else:
            print(f"  ⚠ X-Status-URL: NOT SET")

        # Verify ZIP file
        # NOTE: despite stream=True, .content buffers the whole body in memory.
        zip_data = response.content
        zip_size_mb = len(zip_data) / (1024 * 1024)
        print(f"\n✓ ZIP file size: {zip_size_mb:.2f} MB")

        # Validate ZIP structure
        try:
            zip_buffer = io.BytesIO(zip_data)
            with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
                file_list = zip_file.namelist()
                print(f"✓ ZIP contains {len(file_list)} files")

                # Show directory structure
                # Collect top-level (and one level deep) directory prefixes.
                print("\nDataset Structure:")
                dirs = set()
                for filepath in file_list:
                    parts = filepath.split('/')
                    if len(parts) > 1:
                        dirs.add(parts[0] + '/' + parts[1] if len(parts) > 2 else parts[0])

                for dir_name in sorted(dirs):
                    file_count = sum(1 for f in file_list if f.startswith(dir_name + '/') and f != dir_name + '/')
                    if file_count > 0:
                        print(f"  📁 {dir_name}/ ({file_count} files)")

                # Check for essential files
                # These paths assume the exporter's default dataset name
                # "docgenie_documents" — confirm against DatasetExporter.
                if 'docgenie_documents/metadata.json' in file_list:
                    print("\n  ✓ metadata.json present")
                if 'docgenie_documents/README.md' in file_list:
                    print(" ✓ README.md present")

        except zipfile.BadZipFile as e:
            print(f"✗ Invalid ZIP file: {e}")
            return False

        # Continuous polling if we have request_id
        if request_id:
            print("\n" + "=" * 80)
            print("CONTINUOUS STATUS POLLING")
            print("=" * 80)
            print(f"Request ID: {request_id}")
            print(f"Polling every {POLL_INTERVAL} seconds...\n")

            poll_count = 0
            # Track last seen values so we only print on change, keeping
            # long-running polls readable.
            last_status = None
            last_progress = None

            while True:
                poll_count += 1
                timestamp = time.strftime("%H:%M:%S")

                try:
                    status_response = requests.get(
                        f"{BASE_URL}/jobs/{request_id}/status",
                        timeout=10
                    )
                    status_response.raise_for_status()
                    status_data = status_response.json()

                    current_status = status_data.get('status')
                    current_progress = status_data.get('progress')

                    # Only print if status or progress changed
                    # NOTE(review): current_status.upper() assumes the API
                    # always returns a non-null 'status' string — confirm.
                    if current_status != last_status or current_progress != last_progress:
                        print(f"[{timestamp}] Poll #{poll_count}: {current_status.upper()}", end="")
                        if current_progress:
                            print(f" - {current_progress}", end="")
                        print()

                        last_status = current_status
                        last_progress = current_progress

                    # Check for terminal states
                    if current_status == "completed":
                        print("\n" + "=" * 80)
                        print("✓ JOB COMPLETED!")
                        print("=" * 80)

                        results = status_data.get('results', {})
                        download_url = results.get('download_url')

                        if download_url:
                            print(f"  ✓ Google Drive URL: {download_url}")
                        else:
                            print(f"  ⏳ Google Drive upload may still be in progress")

                        if results.get('file_size_mb'):
                            print(f"  File Size: {results['file_size_mb']:.2f} MB")

                        print(f"  Document Count: {results.get('document_count', 'N/A')}")
                        print(f"  Created: {status_data.get('created_at')}")
                        print(f"  Completed: {status_data.get('updated_at')}")

                        break

                    elif current_status == "failed":
                        print("\n" + "=" * 80)
                        print("✗ JOB FAILED!")
                        print("=" * 80)
                        print(f"  Error: {status_data.get('error_message', 'Unknown error')}")
                        return False

                    # Wait before next poll
                    time.sleep(POLL_INTERVAL)

                except KeyboardInterrupt:
                    # Ctrl-C stops polling but does not fail the test; the
                    # user can resume polling manually with the printed URL.
                    print("\n\n⚠ Polling interrupted by user")
                    print(f"You can continue polling manually:")
                    print(f"  GET {BASE_URL}/jobs/{request_id}/status")
                    break

                except Exception as e:
                    # Transient polling errors are tolerated; retry after the
                    # usual interval.
                    print(f"\n⚠ Error polling status: {e}")
                    time.sleep(POLL_INTERVAL)

        print("\n" + "=" * 80)
        print("✅ TEST COMPLETED SUCCESSFULLY")
        print("=" * 80)
        print(f"✓ ZIP received in {elapsed_time:.1f} seconds")
        print(f"✓ ZIP size: {zip_size_mb:.2f} MB")
        print(f"✓ Dataset structure validated")
        print(f"✓ Google Drive upload tracked")
        return True

    except requests.exceptions.Timeout:
        print(f"✗ Request timed out")
        return False
    except Exception as e:
        print(f"✗ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def main():
    """Run the health check, then the sync endpoint test, and exit with 0/1."""
    divider = "=" * 80
    print("\n" + divider)
    print(" " * 15 + "SYNC PDF API TEST - FULL FEATURE SET")
    print(divider)
    print(f"Base URL: {BASE_URL}")
    print(divider)
    print()

    # Step 1: the API must answer /health before anything else is attempted.
    if not test_health():
        print("\n❌ API is not accessible. Make sure the server is running.")
        print(f"   Expected URL: {BASE_URL}")
        sys.exit(1)

    # Step 2: exercise the full sync generation flow.
    success = test_sync_endpoint()

    # Final summary banner.
    print("\n" + divider)
    print(" " * 30 + "SUMMARY")
    print(divider)

    if success:
        print("✅ ALL TESTS PASSED!")
        print("\nFeatures tested:")
        for feature in (
            "Handwriting insertion",
            "Visual elements (5 types)",
            "OCR processing",
            "Ground truth verification",
            "Analysis & debug visualization",
            "Dataset export",
            "Google Drive upload",
            "Continuous status polling",
        ):
            print(f"  ✓ {feature}")
    else:
        print("❌ TEST FAILED")

    print(divider)

    sys.exit(0 if success else 1)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# Script entry point: run the full test suite when executed directly.
if __name__ == "__main__":
    main()
|
api/utils.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
api/worker.py
ADDED
|
@@ -0,0 +1,804 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Background worker for processing document generation jobs using batched Claude API.
|
| 3 |
+
Runs as RQ worker process.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import io
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import pathlib
|
| 11 |
+
import tempfile
|
| 12 |
+
import time
|
| 13 |
+
import traceback
|
| 14 |
+
import zipfile
|
| 15 |
+
import shutil
|
| 16 |
+
import math
|
| 17 |
+
from typing import Dict, Any, List, Callable
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
|
| 20 |
+
# Add worker startup logging
|
| 21 |
+
from .config import settings
|
| 22 |
+
|
| 23 |
+
from .supabase_client import supabase_client
|
| 24 |
+
from .google_drive import GoogleDriveClient
|
| 25 |
+
from .utils import (
|
| 26 |
+
download_seed_images,
|
| 27 |
+
build_prompt,
|
| 28 |
+
extract_html_documents_from_response,
|
| 29 |
+
extract_ground_truth,
|
| 30 |
+
extract_css_from_html,
|
| 31 |
+
render_html_to_pdf,
|
| 32 |
+
extract_bboxes_from_rendered_pdf,
|
| 33 |
+
pdf_to_base64,
|
| 34 |
+
process_stage3_complete,
|
| 35 |
+
process_stage4_ocr,
|
| 36 |
+
process_stage5_complete,
|
| 37 |
+
validate_html_structure,
|
| 38 |
+
validate_pdf,
|
| 39 |
+
validate_bboxes
|
| 40 |
+
)
|
| 41 |
+
from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient
|
| 42 |
+
from docgenie import ENV
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ==================== Worker Logging Configuration ====================
# Verbose worker logs are opt-in via the WORKER_VERBOSE_LOGGING env var;
# any value other than true/1/yes (case-insensitive) keeps logs quiet.
VERBOSE_LOGGING = os.getenv('WORKER_VERBOSE_LOGGING', 'false').lower() in {'true', '1', 'yes'}
|
| 48 |
+
|
| 49 |
+
def log_verbose(message: str):
    """Print *message* only when verbose worker logging is enabled."""
    if not VERBOSE_LOGGING:
        return
    print(message)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ==================== Startup Validation ====================
|
| 56 |
+
def validate_worker_config():
    """Print a startup summary of required and optional worker settings."""
    rule = "=" * 60
    print(rule)
    print("🔧 Worker Configuration Check")
    print(rule)

    # Anthropic API key is required for batch generation.
    if settings.ANTHROPIC_API_KEY:
        print("✓ ANTHROPIC_API_KEY: Set")
    else:
        print("✗ ANTHROPIC_API_KEY: NOT SET (REQUIRED)")

    # Supabase is required for job status/result persistence.
    if settings.SUPABASE_URL and settings.SUPABASE_KEY:
        print(f"✓ SUPABASE: {settings.SUPABASE_URL[:30]}...")
    else:
        print("✗ SUPABASE: NOT SET (REQUIRED)")

    # Google OAuth client is optional; without it, access-token
    # auto-refresh is unavailable and tokens must outlive the job.
    if settings.GOOGLE_CLIENT_ID and settings.GOOGLE_CLIENT_SECRET:
        print(f"✓ GOOGLE_CLIENT_ID: {settings.GOOGLE_CLIENT_ID[:20]}...")
        print("✓ GOOGLE_CLIENT_SECRET: Set")
        print(" → Token auto-refresh: ENABLED")
    else:
        print("⚠ GOOGLE_CLIENT_ID/SECRET: Not set")
        print(" → Token auto-refresh: DISABLED")
        print(" → Users must provide fresh access tokens that don't expire during processing")

    print(rule)
|
| 85 |
+
|
| 86 |
+
# Run validation on module import so misconfiguration is reported as soon
# as the RQ worker process loads this module (side effect: prints to stdout).
validate_worker_config()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def retry_on_network_error(func: Callable, max_retries: int = 3, delay: float = 2.0) -> Any:
    """
    Retry a zero-argument callable on transient network errors with
    exponential backoff.

    An exception is treated as network-related when its message contains
    one of: 'name resolution', 'connection', 'timeout', 'network'
    (case-insensitive). Non-network exceptions, and the final failing
    attempt, are re-raised immediately.

    Args:
        func: Function to execute (must be callable with no args).
        max_retries: Maximum number of attempts (must be >= 1).
        delay: Initial delay in seconds (doubles each retry).

    Returns:
        Result of the function call on the first successful attempt.

    Raises:
        ValueError: If max_retries is less than 1.
        Exception: The exception raised by func when retries are exhausted
            (or immediately for non-network errors).
    """
    if max_retries < 1:
        # Fix: the previous implementation fell through to `raise None`
        # here, producing an opaque TypeError; fail fast and clearly.
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    network_markers = ('name resolution', 'connection', 'timeout', 'network')
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            error_str = str(e).lower()
            is_network_error = any(marker in error_str for marker in network_markers)
            # Retry only transient network/DNS failures while attempts
            # remain; everything else propagates right away.
            if is_network_error and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"[Retry {attempt + 1}/{max_retries}] Network error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
async def process_document_generation_job_async(request_id: str, request_data: Dict[str, Any]):
|
| 126 |
+
"""
|
| 127 |
+
Async background job function - processes document generation using batched Claude API.
|
| 128 |
+
|
| 129 |
+
This function:
|
| 130 |
+
1. Creates Claude batch with single message (generates N documents)
|
| 131 |
+
2. Polls batch until completion
|
| 132 |
+
3. Processes all documents (PDFs, handwriting, etc.)
|
| 133 |
+
4. Uploads ZIP to user's Google Drive
|
| 134 |
+
5. Updates Supabase with results
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
request_id: Document request UUID from Supabase
|
| 138 |
+
request_data: Request parameters dict containing:
|
| 139 |
+
- user_id: int
|
| 140 |
+
- seed_images: List[str] (URLs)
|
| 141 |
+
- prompt_params: Dict (language, doc_type, num_solutions, etc.)
|
| 142 |
+
|
| 143 |
+
Raises:
|
| 144 |
+
Exception: Any error during processing (logged to Supabase)
|
| 145 |
+
"""
|
| 146 |
+
user_id = request_data['user_id']
|
| 147 |
+
google_drive_token = request_data.get('google_drive_token')
|
| 148 |
+
google_drive_refresh_token = request_data.get('google_drive_refresh_token')
|
| 149 |
+
seed_image_urls = request_data['seed_images']
|
| 150 |
+
prompt_params = request_data['prompt_params']
|
| 151 |
+
|
| 152 |
+
# Validate Google Drive credentials configuration
|
| 153 |
+
if google_drive_refresh_token:
|
| 154 |
+
if not settings.GOOGLE_CLIENT_ID or not settings.GOOGLE_CLIENT_SECRET:
|
| 155 |
+
print(f"[Job {request_id}] ⚠️ WARNING: refresh_token provided but GOOGLE_CLIENT_ID/SECRET not configured")
|
| 156 |
+
print(f"[Job {request_id}] Token auto-refresh will fail. Ensure access token remains valid.")
|
| 157 |
+
|
| 158 |
+
# Create temporary directories for this job
|
| 159 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 160 |
+
tmp_path = pathlib.Path(tmp_dir)
|
| 161 |
+
batch_dir = tmp_path / "batches"
|
| 162 |
+
message_dir = tmp_path / "messages"
|
| 163 |
+
batch_dir.mkdir(exist_ok=True)
|
| 164 |
+
message_dir.mkdir(exist_ok=True)
|
| 165 |
+
|
| 166 |
+
# Initialize DatasetExporter for organized structure
|
| 167 |
+
from .dataset_exporter import DatasetExporter
|
| 168 |
+
exporter = DatasetExporter(tmp_path, dataset_name="docgenie_documents")
|
| 169 |
+
|
| 170 |
+
try:
|
| 171 |
+
# ==================== Update Status: Downloading ====================
|
| 172 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "downloading"))
|
| 173 |
+
print(f"[Job {request_id}] Status: downloading (fetching seed images)")
|
| 174 |
+
|
| 175 |
+
# ==================== Step 1: Download Seed Images ====================
|
| 176 |
+
log_verbose(f"[Job {request_id}] Downloading {len(seed_image_urls)} seed images...")
|
| 177 |
+
seed_images_base64 = download_seed_images(seed_image_urls)
|
| 178 |
+
log_verbose(f"[Job {request_id}] Downloaded {len(seed_images_base64)} images")
|
| 179 |
+
|
| 180 |
+
# ==================== Step 2: Build Prompts (Chunked) ====================
|
| 181 |
+
prompt_template_path = ENV.PROMPT_TEMPLATES_DIR / "ClaudeRefined12" / "seed-based-json.txt"
|
| 182 |
+
if not prompt_template_path.exists():
|
| 183 |
+
raise FileNotFoundError(f"Prompt template not found: {prompt_template_path}")
|
| 184 |
+
|
| 185 |
+
num_solutions = prompt_params.get('num_solutions', 1)
|
| 186 |
+
chunk_size = settings.BATCH_PROMPT_CHUNK_SIZE
|
| 187 |
+
num_prompts = math.ceil(num_solutions / chunk_size)
|
| 188 |
+
|
| 189 |
+
prompts = []
|
| 190 |
+
images_base64_list = []
|
| 191 |
+
image_docids_list = []
|
| 192 |
+
|
| 193 |
+
for i in range(num_prompts):
|
| 194 |
+
# Calculate how many solutions for this specific prompt
|
| 195 |
+
current_num_solutions = min(chunk_size, num_solutions - (i * chunk_size))
|
| 196 |
+
|
| 197 |
+
p = build_prompt(
|
| 198 |
+
language=prompt_params.get('language', 'English'),
|
| 199 |
+
doc_type=prompt_params.get('doc_type', 'business and administrative'),
|
| 200 |
+
gt_type=prompt_params.get('gt_type', 'Questions and answers'),
|
| 201 |
+
gt_format=prompt_params.get('gt_format', '{"question": "answer"}'),
|
| 202 |
+
num_solutions=current_num_solutions,
|
| 203 |
+
num_seed_images=len(seed_images_base64),
|
| 204 |
+
prompt_template_path=prompt_template_path,
|
| 205 |
+
enable_visual_elements=prompt_params.get('enable_visual_elements', False),
|
| 206 |
+
visual_element_types=prompt_params.get('visual_element_types', [])
|
| 207 |
+
)
|
| 208 |
+
prompts.append(p)
|
| 209 |
+
images_base64_list.append(seed_images_base64)
|
| 210 |
+
image_docids_list.append(["seed"] * len(seed_images_base64))
|
| 211 |
+
|
| 212 |
+
log_verbose(f"[Job {request_id}] Created {num_prompts} prompts (chunk size: {chunk_size})")
|
| 213 |
+
|
| 214 |
+
# ==================== Step 3: Create Claude Batch ====================
|
| 215 |
+
log_verbose(f"[Job {request_id}] Creating Claude batch with {num_prompts} messages...")
|
| 216 |
+
|
| 217 |
+
client = ClaudeBatchedClient(api_key=settings.ANTHROPIC_API_KEY)
|
| 218 |
+
|
| 219 |
+
# Send batch with multiple messages (one per chunk)
|
| 220 |
+
client.send_batch(
|
| 221 |
+
model=settings.CLAUDE_MODEL,
|
| 222 |
+
prompts=prompts,
|
| 223 |
+
images_base64=images_base64_list,
|
| 224 |
+
image_docids=image_docids_list,
|
| 225 |
+
batch_data_directory=batch_dir,
|
| 226 |
+
max_tokens=16384
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
print(f"[Job {request_id}] ⏳ Batch created with {num_prompts} tasks, awaiting processing...")
|
| 230 |
+
|
| 231 |
+
# ==================== Step 4: Poll Batch Until Complete ====================
|
| 232 |
+
client.await_batches(
|
| 233 |
+
batch_data_directory=batch_dir,
|
| 234 |
+
message_data_directory=message_dir,
|
| 235 |
+
sleep_seconds_between_batch=2,
|
| 236 |
+
sleep_seconds_iteration=settings.BATCH_POLL_INTERVAL
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
print(f"[Job {request_id}] ✓ Batch complete")
|
| 240 |
+
|
| 241 |
+
# ==================== Step 5: Read Batch Results ====================
|
| 242 |
+
message_files = list(message_dir.glob("*.json"))
|
| 243 |
+
|
| 244 |
+
if not message_files:
|
| 245 |
+
raise RuntimeError("No message results found after batch completion")
|
| 246 |
+
|
| 247 |
+
html_documents = []
|
| 248 |
+
for msg_file in message_files:
|
| 249 |
+
try:
|
| 250 |
+
message_data = json.loads(msg_file.read_text())
|
| 251 |
+
if message_data.get('result_type') == 'succeeded':
|
| 252 |
+
llm_response = message_data['response']
|
| 253 |
+
docs = extract_html_documents_from_response(llm_response)
|
| 254 |
+
html_documents.extend(docs)
|
| 255 |
+
log_verbose(f" ✓ Extracted {len(docs)} documents from task {msg_file.stem}")
|
| 256 |
+
else:
|
| 257 |
+
error_msg = message_data.get('error', 'Unknown error')
|
| 258 |
+
print(f"[Job {request_id}] ⚠ Task {msg_file.stem} failed: {error_msg}")
|
| 259 |
+
except Exception as e:
|
| 260 |
+
print(f"[Job {request_id}] ⚠ Error reading message result {msg_file.name}: {e}")
|
| 261 |
+
|
| 262 |
+
if not html_documents:
|
| 263 |
+
raise RuntimeError("No valid HTML documents found in any batch results")
|
| 264 |
+
|
| 265 |
+
print(f"[Job {request_id}] ✓ Combined total of {len(html_documents)} documents from all tasks")
|
| 266 |
+
|
| 267 |
+
# ==================== Update Status: Generating ====================
|
| 268 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "generating"))
|
| 269 |
+
print(f"[Job {request_id}] Status: generating (processing documents)")
|
| 270 |
+
|
| 271 |
+
# ==================== Step 7: Download Assets from Supabase ====================
|
| 272 |
+
assets_temp_dir = None
|
| 273 |
+
try:
|
| 274 |
+
assets_path = f"{user_id}/{request_id}/assets"
|
| 275 |
+
files = supabase_client.list_files("doc_storage", assets_path)
|
| 276 |
+
|
| 277 |
+
# Filter out directories
|
| 278 |
+
asset_files = [f for f in files if f.get('id') is not None]
|
| 279 |
+
|
| 280 |
+
if asset_files:
|
| 281 |
+
assets_temp_dir = pathlib.Path(tempfile.mkdtemp())
|
| 282 |
+
print(f"[Job {request_id}] Found {len(asset_files)} assets in storage, downloading...")
|
| 283 |
+
|
| 284 |
+
for file_info in asset_files:
|
| 285 |
+
file_name = file_info['name']
|
| 286 |
+
try:
|
| 287 |
+
file_content = supabase_client.download_file("doc_storage", f"{assets_path}/{file_name}")
|
| 288 |
+
with open(assets_temp_dir / file_name, 'wb') as f:
|
| 289 |
+
f.write(file_content)
|
| 290 |
+
log_verbose(f" ✓ Downloaded {file_name}")
|
| 291 |
+
except Exception as download_err:
|
| 292 |
+
print(f" ⚠ Failed to download {file_name}: {download_err}")
|
| 293 |
+
else:
|
| 294 |
+
log_verbose(f"[Job {request_id}] No assets found in {assets_path}")
|
| 295 |
+
except Exception as e:
|
| 296 |
+
print(f"[Job {request_id}] ⚠ Asset check/download failed: {e}")
|
| 297 |
+
|
| 298 |
+
# ==================== Step 8: Process Each Document ====================
|
| 299 |
+
pdf_files = []
|
| 300 |
+
metadata = []
|
| 301 |
+
|
| 302 |
+
for idx, html in enumerate(html_documents):
|
| 303 |
+
try:
|
| 304 |
+
doc_id = f"document_{idx + 1}"
|
| 305 |
+
log_verbose(f"[Job {request_id}] Processing document {idx + 1}/{len(html_documents)}")
|
| 306 |
+
|
| 307 |
+
# Initialize original_pdf_path
|
| 308 |
+
original_pdf_path = None
|
| 309 |
+
|
| 310 |
+
# Validate HTML
|
| 311 |
+
is_valid, error_msg = validate_html_structure(html)
|
| 312 |
+
if not is_valid:
|
| 313 |
+
print(f"[Job {request_id}] Document {idx + 1} HTML validation failed: {error_msg}")
|
| 314 |
+
continue
|
| 315 |
+
|
| 316 |
+
# Extract ground truth and CSS
|
| 317 |
+
gt, html_clean = extract_ground_truth(html)
|
| 318 |
+
css, _ = extract_css_from_html(html_clean)
|
| 319 |
+
|
| 320 |
+
# Render to PDF
|
| 321 |
+
pdf_path = tmp_path / f"{doc_id}.pdf"
|
| 322 |
+
pdf_path, width_mm, height_mm, geometries = await render_html_to_pdf(
|
| 323 |
+
html=html_clean,
|
| 324 |
+
output_pdf_path=pdf_path
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
# Track original PDF
|
| 328 |
+
original_pdf_path = pdf_path
|
| 329 |
+
|
| 330 |
+
# Validate PDF
|
| 331 |
+
is_valid, error_msg = validate_pdf(pdf_path)
|
| 332 |
+
if not is_valid:
|
| 333 |
+
print(f"[Job {request_id}] Document {idx + 1} PDF validation failed: {error_msg}")
|
| 334 |
+
continue
|
| 335 |
+
|
| 336 |
+
# Extract bounding boxes
|
| 337 |
+
bboxes_raw = extract_bboxes_from_rendered_pdf(pdf_path)
|
| 338 |
+
|
| 339 |
+
# Validate bboxes
|
| 340 |
+
is_valid, error_msg = validate_bboxes(bboxes_raw, min_bbox_count=1)
|
| 341 |
+
if not is_valid:
|
| 342 |
+
print(f"[Job {request_id}] Document {idx + 1} BBox validation warning: {error_msg}")
|
| 343 |
+
|
| 344 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: Extracted {len(bboxes_raw)} bboxes")
|
| 345 |
+
|
| 346 |
+
# Process Stage 3 (Handwriting & Visual Elements) if enabled
|
| 347 |
+
final_image_b64 = None
|
| 348 |
+
handwriting_regions = []
|
| 349 |
+
visual_elements = []
|
| 350 |
+
handwriting_images = {}
|
| 351 |
+
visual_element_images = {}
|
| 352 |
+
ocr_results = None
|
| 353 |
+
pdf_with_handwriting_path = None
|
| 354 |
+
pdf_final_path = None
|
| 355 |
+
|
| 356 |
+
if prompt_params.get('enable_handwriting') or prompt_params.get('enable_visual_elements'):
|
| 357 |
+
# Update status: Handwriting
|
| 358 |
+
if prompt_params.get('enable_handwriting'):
|
| 359 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "handwriting"))
|
| 360 |
+
log_verbose(f"[Job {request_id}] Status: handwriting (generating handwritten text)")
|
| 361 |
+
|
| 362 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: Processing handwriting/visual elements...")
|
| 363 |
+
|
| 364 |
+
try:
|
| 365 |
+
final_image_b64, handwriting_regions, visual_elements, handwriting_images, visual_element_images, pdf_with_handwriting_path, pdf_final_path = await process_stage3_complete(
|
| 366 |
+
pdf_path=pdf_path,
|
| 367 |
+
geometries=geometries,
|
| 368 |
+
ground_truth=gt,
|
| 369 |
+
bboxes_raw=bboxes_raw,
|
| 370 |
+
page_width_mm=width_mm,
|
| 371 |
+
page_height_mm=height_mm,
|
| 372 |
+
enable_handwriting=prompt_params.get('enable_handwriting', False),
|
| 373 |
+
handwriting_ratio=prompt_params.get('handwriting_ratio', 0.3),
|
| 374 |
+
enable_visual_elements=prompt_params.get('enable_visual_elements', False),
|
| 375 |
+
visual_element_types=prompt_params.get('visual_element_types', []),
|
| 376 |
+
seed=prompt_params.get('seed'),
|
| 377 |
+
assets_dir=assets_temp_dir,
|
| 378 |
+
barcode_number=prompt_params.get('barcode_number')
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
# Use final PDF if both modifications applied, otherwise use handwriting PDF
|
| 382 |
+
if pdf_final_path and pdf_final_path.exists():
|
| 383 |
+
pdf_path = pdf_final_path
|
| 384 |
+
elif pdf_with_handwriting_path and pdf_with_handwriting_path.exists():
|
| 385 |
+
pdf_path = pdf_with_handwriting_path
|
| 386 |
+
|
| 387 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: {len(handwriting_regions)} handwriting, {len(visual_elements)} visual elements")
|
| 388 |
+
|
| 389 |
+
except Exception as e:
|
| 390 |
+
print(f"[Job {request_id}] Document {idx + 1}: Stage 3 failed: {str(e)}")
|
| 391 |
+
|
| 392 |
+
# Process Stage 4/5 (OCR) if needed
|
| 393 |
+
if prompt_params.get('enable_ocr'):
|
| 394 |
+
# Update status: OCR
|
| 395 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "ocr"))
|
| 396 |
+
log_verbose(f"[Job {request_id}] Status: ocr (running OCR on documents)")
|
| 397 |
+
|
| 398 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: Processing OCR...")
|
| 399 |
+
|
| 400 |
+
try:
|
| 401 |
+
stage4_image, ocr_results = await process_stage4_ocr(
|
| 402 |
+
pdf_path=pdf_path,
|
| 403 |
+
enable_ocr=True,
|
| 404 |
+
dpi=settings.OCR_DPI
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
if ocr_results:
|
| 408 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: OCR complete - {len(ocr_results.get('words', []))} words")
|
| 409 |
+
|
| 410 |
+
except Exception as e:
|
| 411 |
+
print(f"[Job {request_id}] Document {idx + 1}: OCR failed: {str(e)}")
|
| 412 |
+
|
| 413 |
+
# Process Stage 5 (Dataset packaging) if needed
|
| 414 |
+
stage5_results = {}
|
| 415 |
+
if any([
|
| 416 |
+
prompt_params.get('enable_bbox_normalization'),
|
| 417 |
+
prompt_params.get('enable_gt_verification'),
|
| 418 |
+
prompt_params.get('enable_analysis'),
|
| 419 |
+
prompt_params.get('enable_debug_visualization')
|
| 420 |
+
]):
|
| 421 |
+
# Update status: Validation (if GT verification enabled)
|
| 422 |
+
if prompt_params.get('enable_gt_verification'):
|
| 423 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "validation"))
|
| 424 |
+
log_verbose(f"[Job {request_id}] Status: validation (validating ground truth)")
|
| 425 |
+
|
| 426 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: Processing dataset packaging...")
|
| 427 |
+
|
| 428 |
+
try:
|
| 429 |
+
stage5_results = await process_stage5_complete(
|
| 430 |
+
document_id=doc_id,
|
| 431 |
+
pdf_path=pdf_path,
|
| 432 |
+
image_base64=final_image_b64,
|
| 433 |
+
ocr_results=ocr_results,
|
| 434 |
+
ground_truth=gt,
|
| 435 |
+
has_handwriting=prompt_params.get('enable_handwriting', False),
|
| 436 |
+
has_visual_elements=prompt_params.get('enable_visual_elements', False),
|
| 437 |
+
layout_elements=visual_elements,
|
| 438 |
+
enable_bbox_normalization=prompt_params.get('enable_bbox_normalization', False),
|
| 439 |
+
enable_gt_verification=prompt_params.get('enable_gt_verification', False),
|
| 440 |
+
enable_analysis=prompt_params.get('enable_analysis', False),
|
| 441 |
+
enable_debug_visualization=prompt_params.get('enable_debug_visualization', False)
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
+
except Exception as e:
|
| 445 |
+
print(f"[Job {request_id}] Document {idx + 1}: Stage 5 failed: {str(e)}")
|
| 446 |
+
|
| 447 |
+
# Track PDFs for metadata
|
| 448 |
+
if original_pdf_path and pdf_path != original_pdf_path:
|
| 449 |
+
pdf_files.append(original_pdf_path)
|
| 450 |
+
pdf_files.append(pdf_path)
|
| 451 |
+
else:
|
| 452 |
+
pdf_files.append(pdf_path)
|
| 453 |
+
|
| 454 |
+
# Extract bbox_pdf (word + char) from original PDF (ground truth positions)
|
| 455 |
+
from .utils import extract_all_bboxes_from_pdf, extract_raw_annotations_from_geometries
|
| 456 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: 📦 Extracting bbox_pdf (word + char level) from original PDF...")
|
| 457 |
+
|
| 458 |
+
try:
|
| 459 |
+
bboxes_pdf = extract_all_bboxes_from_pdf(original_pdf_path if original_pdf_path else pdf_path)
|
| 460 |
+
bbox_pdf_word = bboxes_pdf.get('word', [])
|
| 461 |
+
bbox_pdf_char = bboxes_pdf.get('char', [])
|
| 462 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: ✓ Extracted {len(bbox_pdf_word)} word bboxes, {len(bbox_pdf_char)} char bboxes from PDF")
|
| 463 |
+
except Exception as e:
|
| 464 |
+
print(f"[Job {request_id}] Document {idx + 1}: ⚠ bbox_pdf extraction failed: {e}")
|
| 465 |
+
bbox_pdf_word = bboxes_raw # Fallback to raw bboxes
|
| 466 |
+
bbox_pdf_char = []
|
| 467 |
+
|
| 468 |
+
# Extract raw_annotations (layout boxes before normalization)
|
| 469 |
+
raw_annotations = None
|
| 470 |
+
if geometries:
|
| 471 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: 📦 Extracting raw_annotations from geometries...")
|
| 472 |
+
try:
|
| 473 |
+
raw_annotations = extract_raw_annotations_from_geometries(geometries)
|
| 474 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: ✓ Extracted {len(raw_annotations)} layout annotations")
|
| 475 |
+
except Exception as e:
|
| 476 |
+
print(f"[Job {request_id}] Document {idx + 1}: ⚠ raw_annotations extraction failed: {e}")
|
| 477 |
+
|
| 478 |
+
# Decode final image to bytes
|
| 479 |
+
final_image_bytes = None
|
| 480 |
+
if final_image_b64:
|
| 481 |
+
import base64
|
| 482 |
+
final_image_bytes = base64.b64decode(final_image_b64)
|
| 483 |
+
|
| 484 |
+
# Decode debug visualization
|
| 485 |
+
debug_viz_bytes = None
|
| 486 |
+
if stage5_results.get('debug_visualization'):
|
| 487 |
+
import base64
|
| 488 |
+
debug_viz_dict = stage5_results['debug_visualization']
|
| 489 |
+
if debug_viz_dict and 'bbox_overlay_base64' in debug_viz_dict:
|
| 490 |
+
debug_viz_b64 = debug_viz_dict['bbox_overlay_base64']
|
| 491 |
+
debug_viz_bytes = base64.b64decode(debug_viz_b64)
|
| 492 |
+
|
| 493 |
+
# Prepare token mapping if tokens exist
|
| 494 |
+
output_detail = prompt_params.get('output_detail', 'minimal')
|
| 495 |
+
token_mapping_data = None
|
| 496 |
+
if output_detail in ["dataset", "complete"]:
|
| 497 |
+
if handwriting_images or visual_element_images:
|
| 498 |
+
from .utils import create_token_mapping_json
|
| 499 |
+
token_mapping_data = create_token_mapping_json(
|
| 500 |
+
handwriting_regions,
|
| 501 |
+
handwriting_images,
|
| 502 |
+
visual_elements,
|
| 503 |
+
visual_element_images
|
| 504 |
+
)
|
| 505 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: 📦 Output detail '{output_detail}': Prepared {len(handwriting_images)} handwriting tokens, {len(visual_element_images)} visual elements")
|
| 506 |
+
|
| 507 |
+
# Extract bbox_final_word and bbox_final_segment (from OCR or PDF)
|
| 508 |
+
bbox_final_word = None
|
| 509 |
+
bbox_final_segment = None
|
| 510 |
+
if ocr_results and ocr_results.get('words'):
|
| 511 |
+
# Use OCR results as final bboxes
|
| 512 |
+
bbox_final_word = ocr_results.get('words', [])
|
| 513 |
+
bbox_final_segment = ocr_results.get('lines', [])
|
| 514 |
+
else:
|
| 515 |
+
# Fallback to PDF bboxes if no OCR
|
| 516 |
+
bbox_final_word = bbox_pdf_word
|
| 517 |
+
bbox_final_segment = [] # No line-level data without OCR
|
| 518 |
+
|
| 519 |
+
# Read PDF bytes for exporter
|
| 520 |
+
pdf_initial_bytes = original_pdf_path.read_bytes()
|
| 521 |
+
|
| 522 |
+
# Read modified PDFs if they exist
|
| 523 |
+
pdf_with_handwriting_bytes = None
|
| 524 |
+
pdf_final_bytes = None
|
| 525 |
+
pdf_with_visual_elements_bytes = None
|
| 526 |
+
|
| 527 |
+
if pdf_with_handwriting_path and pdf_with_handwriting_path.exists():
|
| 528 |
+
pdf_with_handwriting_bytes = pdf_with_handwriting_path.read_bytes()
|
| 529 |
+
|
| 530 |
+
if pdf_final_path and pdf_final_path.exists():
|
| 531 |
+
pdf_final_bytes = pdf_final_path.read_bytes()
|
| 532 |
+
|
| 533 |
+
# Special case: if only visual elements (no handwriting), pdf_final is actually pdf_with_visual_elements
|
| 534 |
+
if pdf_final_bytes and not pdf_with_handwriting_bytes:
|
| 535 |
+
pdf_with_visual_elements_bytes = pdf_final_bytes
|
| 536 |
+
pdf_final_bytes = None
|
| 537 |
+
|
| 538 |
+
# Add document to exporter
|
| 539 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: 📦 Adding document to dataset exporter...")
|
| 540 |
+
exporter.add_document(
|
| 541 |
+
document_id=doc_id,
|
| 542 |
+
html=html_clean,
|
| 543 |
+
css=css,
|
| 544 |
+
pdf_initial=pdf_initial_bytes,
|
| 545 |
+
pdf_with_handwriting=pdf_with_handwriting_bytes,
|
| 546 |
+
pdf_with_visual_elements=pdf_with_visual_elements_bytes,
|
| 547 |
+
pdf_final=pdf_final_bytes,
|
| 548 |
+
final_image=final_image_bytes,
|
| 549 |
+
ground_truth=gt,
|
| 550 |
+
raw_annotations=raw_annotations,
|
| 551 |
+
bboxes_pdf_word=bbox_pdf_word,
|
| 552 |
+
bboxes_pdf_char=bbox_pdf_char,
|
| 553 |
+
bboxes_final_word=bbox_final_word,
|
| 554 |
+
bboxes_final_segment=bbox_final_segment,
|
| 555 |
+
bboxes_normalized_word=stage5_results.get('normalized_bboxes_word'),
|
| 556 |
+
bboxes_normalized_segment=stage5_results.get('normalized_bboxes_segment'),
|
| 557 |
+
gt_verification=stage5_results.get('gt_verification'),
|
| 558 |
+
token_mapping=token_mapping_data,
|
| 559 |
+
handwriting_regions=handwriting_regions,
|
| 560 |
+
handwriting_images=handwriting_images,
|
| 561 |
+
visual_elements=visual_elements,
|
| 562 |
+
visual_element_images=visual_element_images,
|
| 563 |
+
layout_elements=visual_elements,
|
| 564 |
+
geometries=geometries,
|
| 565 |
+
ocr_results=ocr_results,
|
| 566 |
+
analysis_stats=stage5_results.get('analysis_stats'),
|
| 567 |
+
debug_visualization=debug_viz_bytes
|
| 568 |
+
)
|
| 569 |
+
log_verbose(f"[Job {request_id}] Document {idx + 1}: ✓ Document {doc_id} added to dataset")
|
| 570 |
+
|
| 571 |
+
# Store comprehensive metadata (matching /generate/pdf format)
|
| 572 |
+
metadata.append({
|
| 573 |
+
"document_id": doc_id,
|
| 574 |
+
"filename": f"{doc_id}.pdf",
|
| 575 |
+
"bboxes": bboxes_raw,
|
| 576 |
+
"ground_truth": gt,
|
| 577 |
+
"geometries": geometries,
|
| 578 |
+
"page_width_mm": width_mm,
|
| 579 |
+
"page_height_mm": height_mm,
|
| 580 |
+
"handwriting_regions": handwriting_regions,
|
| 581 |
+
"visual_elements": visual_elements,
|
| 582 |
+
"has_stage3_image": final_image_b64 is not None,
|
| 583 |
+
"ocr_results": ocr_results,
|
| 584 |
+
# Stage 5 results
|
| 585 |
+
"normalized_bboxes_word": stage5_results.get('normalized_bboxes_word'),
|
| 586 |
+
"normalized_bboxes_segment": stage5_results.get('normalized_bboxes_segment'),
|
| 587 |
+
"gt_verification": stage5_results.get('gt_verification'),
|
| 588 |
+
"analysis_stats": stage5_results.get('analysis_stats'),
|
| 589 |
+
"debug_visualization_available": stage5_results.get('debug_visualization') is not None
|
| 590 |
+
})
|
| 591 |
+
|
| 592 |
+
except Exception as e:
|
| 593 |
+
print(f"[Job {request_id}] Error processing document {idx + 1}: {str(e)}")
|
| 594 |
+
traceback.print_exc()
|
| 595 |
+
continue
|
| 596 |
+
|
| 597 |
+
if not pdf_files:
|
| 598 |
+
raise RuntimeError("Failed to process any documents")
|
| 599 |
+
|
| 600 |
+
log_verbose(f"[Job {request_id}] Processed {len(pdf_files)} PDF files")
|
| 601 |
+
|
| 602 |
+
# ==================== Step 8: Finalize Dataset & Create ZIP ====================
|
| 603 |
+
log_verbose(f"[Job {request_id}] 📦 Finalizing dataset export...")
|
| 604 |
+
exporter.finalize(
|
| 605 |
+
request_id=request_id,
|
| 606 |
+
user_id=user_id,
|
| 607 |
+
prompt_params=prompt_params,
|
| 608 |
+
api_mode="async"
|
| 609 |
+
)
|
| 610 |
+
log_verbose(f"[Job {request_id}] ✓ Dataset structure finalized at {exporter.base_path}")
|
| 611 |
+
|
| 612 |
+
# ==================== Update Status: Zipping ====================
|
| 613 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "zipping"))
|
| 614 |
+
print(f"[Job {request_id}] Status: zipping (creating ZIP archive)")
|
| 615 |
+
|
| 616 |
+
# Create ZIP from organized dataset
|
| 617 |
+
log_verbose(f"[Job {request_id}] 📦 Creating ZIP archive from dataset...")
|
| 618 |
+
zip_path = tmp_path / f"docgenie_{request_id}.zip"
|
| 619 |
+
|
| 620 |
+
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
| 621 |
+
# Add all files from exporter.base_path
|
| 622 |
+
for file_path in exporter.base_path.rglob('*'):
|
| 623 |
+
if file_path.is_file():
|
| 624 |
+
arcname = file_path.relative_to(exporter.base_path.parent)
|
| 625 |
+
zip_file.write(file_path, arcname)
|
| 626 |
+
|
| 627 |
+
zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
|
| 628 |
+
log_verbose(f"[Job {request_id}] ✓ ZIP created: {zip_size_mb:.2f} MB")
|
| 629 |
+
|
| 630 |
+
# ==================== Update Status: Uploading ====================
|
| 631 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, "uploading"))
|
| 632 |
+
print(f"[Job {request_id}] Status: uploading (uploading to Google Drive)")
|
| 633 |
+
|
| 634 |
+
# ==================== Step 9: Upload to Google Drive ====================
|
| 635 |
+
print(f"[Job {request_id}] ⬆️ Uploading to Google Drive...")
|
| 636 |
+
|
| 637 |
+
google_drive_url = None
|
| 638 |
+
gdrive_failed = False
|
| 639 |
+
# Check if Google Drive token provided
|
| 640 |
+
if not google_drive_token:
|
| 641 |
+
print(f"[Job {request_id}] No Google Drive token provided. Skipping Google Drive upload.")
|
| 642 |
+
else:
|
| 643 |
+
try:
|
| 644 |
+
drive_client = GoogleDriveClient(
|
| 645 |
+
access_token=google_drive_token,
|
| 646 |
+
refresh_token=google_drive_refresh_token
|
| 647 |
+
)
|
| 648 |
+
google_drive_url = drive_client.upload_file(
|
| 649 |
+
file_path=zip_path,
|
| 650 |
+
filename=f"docgenie_{request_id}.zip",
|
| 651 |
+
folder_name=settings.GOOGLE_DRIVE_FOLDER_NAME
|
| 652 |
+
)
|
| 653 |
+
|
| 654 |
+
print(f"[Job {request_id}] ✓ Uploaded to Google Drive: {google_drive_url}")
|
| 655 |
+
|
| 656 |
+
except Exception as e:
|
| 657 |
+
print(f"[Job {request_id}] Google Drive upload failed: {str(e)}")
|
| 658 |
+
gdrive_failed = True
|
| 659 |
+
# Do not raise an error, just continue so we can still save to Supabase
|
| 660 |
+
|
| 661 |
+
# ==================== Step 10: Store Results in Supabase ====================
|
| 662 |
+
log_verbose(f"[Job {request_id}] Saving results to Supabase...")
|
| 663 |
+
log_verbose(f"[Job {request_id}] URL: {google_drive_url}")
|
| 664 |
+
|
| 665 |
+
# Upload ZIP to Supabase
|
| 666 |
+
zip_url = None
|
| 667 |
+
try:
|
| 668 |
+
zip_storage_path = f"{user_id}/{request_id}/generated/docgenie_{request_id}.zip"
|
| 669 |
+
supabase_client.upload_to_storage("doc_storage", zip_storage_path, zip_path.read_bytes(), "application/zip")
|
| 670 |
+
zip_url = supabase_client.get_public_url("doc_storage", zip_storage_path)
|
| 671 |
+
print(f"[Job {request_id}] ✓ Uploaded ZIP to Supabase: {zip_url}")
|
| 672 |
+
except Exception as e:
|
| 673 |
+
print(f"[Job {request_id}] ⚠ Supabase ZIP upload failed: {e}")
|
| 674 |
+
|
| 675 |
+
# ==================== Step 11: Upload Individual Documents to Supabase ====================
|
| 676 |
+
print(f"[Job {request_id}] Uploading individual documents to Supabase...")
|
| 677 |
+
for idx, doc_data in enumerate(metadata):
|
| 678 |
+
doc_id = doc_data["document_id"]
|
| 679 |
+
try:
|
| 680 |
+
# Determine paths (matching sync endpoint structure)
|
| 681 |
+
doc_storage_path = f"{user_id}/{request_id}/generated/{idx}_doc.pdf"
|
| 682 |
+
gt_storage_path = f"{user_id}/{request_id}/generated/{idx}_gt.json"
|
| 683 |
+
src_storage_path = f"{user_id}/{request_id}/generated/{idx}_src.html"
|
| 684 |
+
bbox_storage_path = f"{user_id}/{request_id}/generated/{idx}_bbox.json"
|
| 685 |
+
|
| 686 |
+
# Find files on disk
|
| 687 |
+
doc_path = exporter.pdf_final_dir / f"{doc_id}.pdf"
|
| 688 |
+
if not doc_path.exists():
|
| 689 |
+
doc_path = exporter.pdf_initial_dir / f"{doc_id}.pdf"
|
| 690 |
+
|
| 691 |
+
gt_path = exporter.gt_dir / f"{doc_id}.json"
|
| 692 |
+
src_path = exporter.html_dir / f"{doc_id}.html"
|
| 693 |
+
bbox_path = exporter.bbox_pdf_word_dir / f"{doc_id}.json"
|
| 694 |
+
|
| 695 |
+
# Upload PDF
|
| 696 |
+
if doc_path.exists():
|
| 697 |
+
supabase_client.upload_to_storage("doc_storage", doc_storage_path, doc_path.read_bytes(), "application/pdf")
|
| 698 |
+
|
| 699 |
+
# Upload Ground Truth
|
| 700 |
+
if gt_path.exists():
|
| 701 |
+
supabase_client.upload_to_storage("doc_storage", gt_storage_path, gt_path.read_bytes(), "application/json")
|
| 702 |
+
|
| 703 |
+
# Upload HTML Source
|
| 704 |
+
if src_path.exists():
|
| 705 |
+
supabase_client.upload_to_storage("doc_storage", src_storage_path, src_path.read_bytes(), "text/html")
|
| 706 |
+
|
| 707 |
+
# Upload Bounding Boxes
|
| 708 |
+
if bbox_path.exists():
|
| 709 |
+
supabase_client.upload_to_storage("doc_storage", bbox_storage_path, bbox_path.read_bytes(), "application/json")
|
| 710 |
+
|
| 711 |
+
# Upload visual element images if available (parity with sync)
|
| 712 |
+
if doc_data.get("visual_elements") and doc_data.get("visual_element_images"):
|
| 713 |
+
for ve_id, img_b64 in doc_data["visual_element_images"].items():
|
| 714 |
+
ve_storage_path = f"{user_id}/{request_id}/generated/{idx}_ve_{ve_id}.png"
|
| 715 |
+
try:
|
| 716 |
+
img_bytes = base64.b64decode(img_b64)
|
| 717 |
+
supabase_client.upload_to_storage("doc_storage", ve_storage_path, img_bytes, "image/png")
|
| 718 |
+
except Exception as ve_err:
|
| 719 |
+
print(f" ⚠ Failed to upload visual element {ve_id}: {ve_err}")
|
| 720 |
+
|
| 721 |
+
# Create record in database (parity with sync)
|
| 722 |
+
retry_on_network_error(lambda: supabase_client.create_generated_document(
|
| 723 |
+
request_id=request_id,
|
| 724 |
+
file_url=supabase_client.get_public_url("doc_storage", doc_storage_path),
|
| 725 |
+
file_type="application/pdf",
|
| 726 |
+
page_count=1,
|
| 727 |
+
model_version=settings.LLM_MODEL,
|
| 728 |
+
doc_index=idx,
|
| 729 |
+
doc_storage_path=doc_storage_path,
|
| 730 |
+
gt_storage_path=gt_storage_path,
|
| 731 |
+
html_storage_path=src_storage_path,
|
| 732 |
+
bbox_storage_path=bbox_storage_path
|
| 733 |
+
))
|
| 734 |
+
|
| 735 |
+
log_verbose(f" ✓ Uploaded document {idx+1} assets and created DB record")
|
| 736 |
+
except Exception as upload_err:
|
| 737 |
+
print(f" ⚠ Failed to upload individual assets for document {idx+1}: {upload_err}")
|
| 738 |
+
|
| 739 |
+
# Create generated document record
|
| 740 |
+
retry_on_network_error(lambda: supabase_client.create_generated_document(
|
| 741 |
+
request_id=request_id,
|
| 742 |
+
file_url=google_drive_url,
|
| 743 |
+
file_type="application/zip",
|
| 744 |
+
page_count=len(metadata), # Using document count as page_count
|
| 745 |
+
model_version=settings.LLM_MODEL,
|
| 746 |
+
zip_url=zip_url
|
| 747 |
+
))
|
| 748 |
+
|
| 749 |
+
# Update request status
|
| 750 |
+
status = "completed_gdrive_failed" if gdrive_failed else "completed"
|
| 751 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(request_id, status))
|
| 752 |
+
|
| 753 |
+
# Log analytics
|
| 754 |
+
retry_on_network_error(lambda: supabase_client.log_analytics_event(
|
| 755 |
+
user_id=user_id,
|
| 756 |
+
event_type="document_generation_completed",
|
| 757 |
+
entity_id=request_id
|
| 758 |
+
))
|
| 759 |
+
|
| 760 |
+
print(f"[Job {request_id}] ✅ Job completed successfully!")
|
| 761 |
+
|
| 762 |
+
except Exception as e:
|
| 763 |
+
# Update status to failed with error message
|
| 764 |
+
error_message = f"{type(e).__name__}: {str(e)}"
|
| 765 |
+
print(f"[Job {request_id}] ❌ Job failed: {error_message}")
|
| 766 |
+
traceback.print_exc()
|
| 767 |
+
|
| 768 |
+
retry_on_network_error(lambda: supabase_client.update_request_status(
|
| 769 |
+
request_id=request_id,
|
| 770 |
+
status="failed",
|
| 771 |
+
error_message=error_message
|
| 772 |
+
))
|
| 773 |
+
|
| 774 |
+
# Log analytics
|
| 775 |
+
retry_on_network_error(lambda: supabase_client.log_analytics_event(
|
| 776 |
+
user_id=user_id,
|
| 777 |
+
event_type="document_generation_failed",
|
| 778 |
+
entity_id=request_id
|
| 779 |
+
))
|
| 780 |
+
|
| 781 |
+
raise # Re-raise so RQ marks job as failed
|
| 782 |
+
finally:
|
| 783 |
+
# Clean up assets directory if it exists
|
| 784 |
+
if 'assets_temp_dir' in locals() and assets_temp_dir and assets_temp_dir.exists():
|
| 785 |
+
try:
|
| 786 |
+
shutil.rmtree(assets_temp_dir, ignore_errors=True)
|
| 787 |
+
print(f"[Job {request_id}] ✓ Cleaned up assets directory {assets_temp_dir}")
|
| 788 |
+
except:
|
| 789 |
+
pass
|
| 790 |
+
|
| 791 |
+
|
| 792 |
+
def process_document_generation_job(request_id: str, request_data: Dict[str, Any]) -> Any:
    """
    Synchronous wrapper for RQ - calls the async function with asyncio.run().

    This is the function that RQ worker calls. It runs the async version using asyncio.

    Args:
        request_id: Unique identifier of the generation request; used for log
            prefixes and passed through to the async job.
        request_data: Raw job payload dict. The banner below reads
            'user_id' and 'prompt_params.num_solutions' from it for logging;
            everything else is consumed by the async implementation.

    Returns:
        Whatever process_document_generation_job_async returns, propagated
        through asyncio.run(). The async function may also re-raise its
        failure so RQ marks the job as failed.
    """
    # Console banner so the worker log clearly shows which job was picked up.
    # .get(..., 'N/A') keeps logging safe even if the payload is missing keys.
    print(f"{'='*60}")
    print(f"🎯 Worker picked up job: {request_id}")
    print(f"    User ID: {request_data.get('user_id', 'N/A')}")
    print(f"    Num documents: {request_data.get('prompt_params', {}).get('num_solutions', 'N/A')}")
    print(f"{'='*60}")

    # asyncio.run creates a fresh event loop per job, which is the correct
    # pattern for a synchronous RQ worker process driving async code.
    return asyncio.run(process_document_generation_job_async(request_id, request_data))
|
data/docvqa_hw/handschrift_mit_qid.jsonl
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"name": "fykp0227_9.pdf", "question_ids": [63039]}
|
| 2 |
+
{"name": "fypn0000_2.pdf", "question_ids": [52698, 52700, 52703]}
|
| 3 |
+
{"name": "hkdv0228_4.pdf", "question_ids": [61298]}
|
| 4 |
+
{"name": "hmmg0227_13.pdf", "question_ids": [63575, 63578]}
|
| 5 |
+
{"name": "hnnp0227_33.pdf", "question_ids": [23169]}
|
| 6 |
+
{"name": "hnnp0227_35.pdf", "question_ids": [21316, 21317]}
|
| 7 |
+
{"name": "hznv0228_1.pdf", "question_ids": [2325, 2327]}
|
| 8 |
+
{"name": "jmmg0227_1.pdf", "question_ids": [63696]}
|
| 9 |
+
{"name": "jqyg0227_1.pdf", "question_ids": [63487]}
|
| 10 |
+
{"name": "jrgn0226_1.pdf", "question_ids": [44904]}
|
| 11 |
+
{"name": "ljgg0227_2.pdf", "question_ids": [61199]}
|
| 12 |
+
{"name": "lkdv0228_13.pdf", "question_ids": [61536, 61537, 61538, 61539]}
|
| 13 |
+
{"name": "lkdv0228_28.pdf", "question_ids": [61527, 61528, 61529, 61530]}
|
| 14 |
+
{"name": "lrgn0226_1.pdf", "question_ids": [43050]}
|
| 15 |
+
{"name": "lyph0227_1.pdf", "question_ids": [5280]}
|
| 16 |
+
{"name": "mmxd0227_1.pdf", "question_ids": [38123, 38163]}
|
| 17 |
+
{"name": "mzyc0227_2.pdf", "question_ids": [63782]}
|
| 18 |
+
{"name": "njpf0227_2.pdf", "question_ids": [61140, 61143, 61144, 61145]}
|
| 19 |
+
{"name": "nmyg0227_1.pdf", "question_ids": [62372, 62373]}
|
| 20 |
+
{"name": "npnm0020_1.pdf", "question_ids": [53668, 53669]}
|
| 21 |
+
{"name": "pnbv0228_1.pdf", "question_ids": [50370]}
|
| 22 |
+
{"name": "rkmd0217_2.pdf", "question_ids": [64390]}
|
| 23 |
+
{"name": "shwg0227_1.pdf", "question_ids": [62010]}
|
| 24 |
+
{"name": "tlgw0228_1.pdf", "question_ids": [50193]}
|
| 25 |
+
{"name": "txpn0095_17.pdf", "question_ids": [52200, 52202, 52204]}
|
| 26 |
+
{"name": "xnbl0037_7.pdf", "question_ids": [575]}
|
| 27 |
+
{"name": "xthh0077_40.pdf", "question_ids": [55068, 55069, 55073, 55076, 55077]}
|
| 28 |
+
{"name": "xthh0077_41.pdf", "question_ids": [54938, 54941, 54943, 54945, 54946, 54948]}
|
| 29 |
+
{"name": "yjdv0228_13.pdf", "question_ids": [61895, 61902]}
|
| 30 |
+
{"name": "yqgl0228_1.pdf", "question_ids": [209]}
|
| 31 |
+
{"name": "zhjl0226_1.pdf", "question_ids": [44927]}
|
| 32 |
+
{"name": "znmf0227_4.pdf", "question_ids": [62529, 62530, 62532]}
|
| 33 |
+
{"name": "yhxd0227_2.pdf", "question_ids": [62644]}
|
| 34 |
+
{"name": "ynbm0227_3.pdf", "question_ids": [56931]}
|
| 35 |
+
{"name": "ffmm0020_1.pdf", "question_ids": [53921, 53924]}
|
| 36 |
+
{"name": "fkmj0226_7.pdf", "question_ids": [52474, 52475, 52476]}
|
| 37 |
+
{"name": "fnkp0227_31.pdf", "question_ids": [63098, 63100, 63102]}
|
| 38 |
+
{"name": "gtmj0226_7.pdf", "question_ids": [5722, 5724]}
|
| 39 |
+
{"name": "gxyd0217_2.pdf", "question_ids": [35331, 35332, 35333, 35335, 35339]}
|
| 40 |
+
{"name": "hhhh0224_10.pdf", "question_ids": [9769, 9770, 9771, 9772]}
|
| 41 |
+
{"name": "hnnp0227_28.pdf", "question_ids": [22936, 22941, 22945, 22995]}
|
| 42 |
+
{"name": "hnnp0227_80.pdf", "question_ids": [23110, 23113, 23116, 23119]}
|
| 43 |
+
{"name": "hqfh0224_17.pdf", "question_ids": [5902, 5904, 5907]}
|
| 44 |
+
{"name": "htwc0228_1.pdf", "question_ids": [62864, 62865]}
|
| 45 |
+
{"name": "jggn0226_39.pdf", "question_ids": [45089, 45094, 45095, 45097, 45098]}
|
| 46 |
+
{"name": "kfkm0081_1.pdf", "question_ids": [56535, 56537, 56539, 56540]}
|
| 47 |
+
{"name": "kggn0226_20.pdf", "question_ids": [46467, 46471, 46475, 46482]}
|
| 48 |
+
{"name": "kggn0226_26.pdf", "question_ids": [43354, 43357, 43376]}
|
| 49 |
+
{"name": "kjnd0004_1.pdf", "question_ids": [7042, 7044, 7045, 7048, 7050]}
|
| 50 |
+
{"name": "kkkp0227_21.pdf", "question_ids": [56382, 56383, 56384, 56385]}
|
| 51 |
+
{"name": "klfw0081_1.pdf", "question_ids": [56916, 56920]}
|
| 52 |
+
{"name": "kyvw0217_1.pdf", "question_ids": [16955, 16956, 16958]}
|
| 53 |
+
{"name": "lfgn0226_29.pdf", "question_ids": [43172, 43185]}
|
| 54 |
+
{"name": "lmyc0227_2.pdf", "question_ids": [40740, 40743, 40745, 40749, 40751, 40755, 40758, 40762, 40764]}
|
| 55 |
+
{"name": "lnnp0227_1.pdf", "question_ids": [59669, 59670, 59671]}
|
| 56 |
+
{"name": "lnwg0227_36.pdf", "question_ids": [31500, 31501, 31502, 31503, 31504]}
|
| 57 |
+
{"name": "lsww0228_14.pdf", "question_ids": [50277, 50281, 50285, 50288, 50291, 50407]}
|
| 58 |
+
{"name": "lxkp0227_5.pdf", "question_ids": [57793, 57794]}
|
| 59 |
+
{"name": "mmkp0227_5.pdf", "question_ids": [62980, 62982]}
|
| 60 |
+
{"name": "nfpw0224_1.pdf", "question_ids": [5571, 5573]}
|
| 61 |
+
{"name": "nhyk0226_1.pdf", "question_ids": [60062, 60064, 60065, 60066]}
|
| 62 |
+
{"name": "nngv0228_2.pdf", "question_ids": [59573]}
|
| 63 |
+
{"name": "pfpw0224_10.pdf", "question_ids": [4950, 4951]}
|
| 64 |
+
{"name": "ppjb0228_7.pdf", "question_ids": [49295, 49301, 49306, 49307, 49312, 49316]}
|
| 65 |
+
{"name": "qllg0023_3.pdf", "question_ids": [54893, 54898]}
|
| 66 |
+
{"name": "rpwx0225_9.pdf", "question_ids": [4763, 4764, 4765, 4766]}
|
| 67 |
+
{"name": "rydb0228_2.pdf", "question_ids": [59517, 59519, 59521, 59523, 59525]}
|
| 68 |
+
{"name": "rzyw0224_1.pdf", "question_ids": [5682, 5685, 5688, 5691, 5697]}
|
| 69 |
+
{"name": "sjkg0227_2.pdf", "question_ids": [61152, 61153, 61154, 61155]}
|
| 70 |
+
{"name": "thwm0227_2.pdf", "question_ids": [59272]}
|
| 71 |
+
{"name": "tpwx0225_9.pdf", "question_ids": [5572, 5574, 5577]}
|
| 72 |
+
{"name": "xngv0228_4.pdf", "question_ids": [59870, 59871, 59872]}
|
| 73 |
+
{"name": "xxhd0227_6.pdf", "question_ids": [62613, 62615]}
|
| 74 |
+
{"name": "xygx0227_9.pdf", "question_ids": [21195, 21197, 21198, 21199]}
|
| 75 |
+
{"name": "xynd0004_1.pdf", "question_ids": [6688, 6689, 6690, 6691]}
|
| 76 |
+
{"name": "xyyv0228_1.pdf", "question_ids": [3036, 3037, 3038]}
|
| 77 |
+
{"name": "yjkg0227_2.pdf", "question_ids": [61148, 61150, 61151]}
|
| 78 |
+
{"name": "ylfh0078_1.pdf", "question_ids": [56262, 56263, 56264, 56265]}
|
| 79 |
+
{"name": "ylml0226_1.pdf", "question_ids": [63790, 63792, 63793]}
|
| 80 |
+
{"name": "ymkm0227_5.pdf", "question_ids": [59275, 59276, 59277, 59278]}
|
| 81 |
+
{"name": "ymkp0227_10.pdf", "question_ids": [61513, 61514, 61515, 61516, 61517, 61518, 61520, 61521]}
|
| 82 |
+
{"name": "zrww0228_4.pdf", "question_ids": [50908, 50909, 50910]}
|
| 83 |
+
{"name": "ztcn0020_11.pdf", "question_ids": [8056, 8057]}
|
data/docvqa_hw/zahlen_mit_qid.jsonl
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"name": "ffdh0224_1.pdf", "question_ids": [5844, 5851]}
|
| 2 |
+
{"name": "fgbd0079_5.pdf", "question_ids": [18595, 18598, 18602, 18606, 18609]}
|
| 3 |
+
{"name": "ghlw0228_3.pdf", "question_ids": [50847]}
|
| 4 |
+
{"name": "glfh0224_9.pdf", "question_ids": [5920, 5922, 5925, 5927, 5931]}
|
| 5 |
+
{"name": "hlmd0217_2.pdf", "question_ids": [38949]}
|
| 6 |
+
{"name": "hnpp0000_3.pdf", "question_ids": [5527]}
|
| 7 |
+
{"name": "hrcd0003_1.pdf", "question_ids": [6844]}
|
| 8 |
+
{"name": "jlmd0217_2.pdf", "question_ids": [64321]}
|
| 9 |
+
{"name": "jmmd0217_2.pdf", "question_ids": [37658]}
|
| 10 |
+
{"name": "jxwg0023_2.pdf", "question_ids": [53898]}
|
| 11 |
+
{"name": "kmmw0228_2.pdf", "question_ids": [56444]}
|
| 12 |
+
{"name": "kspw0224_2.pdf", "question_ids": [5980, 5987]}
|
| 13 |
+
{"name": "lsww0228_12.pdf", "question_ids": [52170, 52172, 52174]}
|
| 14 |
+
{"name": "prhm0227_3.pdf", "question_ids": [47948]}
|
| 15 |
+
{"name": "pzyy0078_7.pdf", "question_ids": [18897, 18899, 18901]}
|
| 16 |
+
{"name": "xfbd0003_1.pdf", "question_ids": [51512]}
|
| 17 |
+
{"name": "xrcy0227_48.pdf", "question_ids": [59635]}
|
| 18 |
+
{"name": "zqlp0000_10.pdf", "question_ids": [6093, 6097]}
|
| 19 |
+
{"name": "zqww0228_8.pdf", "question_ids": [50854, 50855, 50856]}
|
| 20 |
+
{"name": "zzyw0224_12.pdf", "question_ids": [4909, 4910, 4911]}
|
data/exports/DocVQA_clip_kmeans.png
ADDED
|
Git LFS Details
|
data/exports/DocVQA_layout_kmeans.png
ADDED
|
Git LFS Details
|
data/exports/DocVQA_text_kmeans.png
ADDED
|
Git LFS Details
|
data/models/handwriting/cached_vae/config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKL",
|
| 3 |
+
"_diffusers_version": "0.35.2",
|
| 4 |
+
"_name_or_path": "stabilityai/sd-vae-ft-mse",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"block_out_channels": [
|
| 7 |
+
128,
|
| 8 |
+
256,
|
| 9 |
+
512,
|
| 10 |
+
512
|
| 11 |
+
],
|
| 12 |
+
"down_block_types": [
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D",
|
| 16 |
+
"DownEncoderBlock2D"
|
| 17 |
+
],
|
| 18 |
+
"force_upcast": true,
|
| 19 |
+
"in_channels": 3,
|
| 20 |
+
"latent_channels": 4,
|
| 21 |
+
"latents_mean": null,
|
| 22 |
+
"latents_std": null,
|
| 23 |
+
"layers_per_block": 2,
|
| 24 |
+
"mid_block_add_attention": true,
|
| 25 |
+
"norm_num_groups": 32,
|
| 26 |
+
"out_channels": 3,
|
| 27 |
+
"sample_size": 256,
|
| 28 |
+
"scaling_factor": 0.18215,
|
| 29 |
+
"shift_factor": null,
|
| 30 |
+
"up_block_types": [
|
| 31 |
+
"UpDecoderBlock2D",
|
| 32 |
+
"UpDecoderBlock2D",
|
| 33 |
+
"UpDecoderBlock2D",
|
| 34 |
+
"UpDecoderBlock2D"
|
| 35 |
+
],
|
| 36 |
+
"use_post_quant_conv": true,
|
| 37 |
+
"use_quant_conv": true
|
| 38 |
+
}
|
data/models/handwriting/char_vocab.json
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"char_to_idx": {
|
| 3 |
+
"<PAD>": 0,
|
| 4 |
+
"<UNK>": 1,
|
| 5 |
+
"<SOS>": 2,
|
| 6 |
+
"<EOS>": 3,
|
| 7 |
+
" ": 4,
|
| 8 |
+
"!": 5,
|
| 9 |
+
"\"": 6,
|
| 10 |
+
"#": 7,
|
| 11 |
+
"&": 8,
|
| 12 |
+
"'": 9,
|
| 13 |
+
"(": 10,
|
| 14 |
+
")": 11,
|
| 15 |
+
"*": 12,
|
| 16 |
+
"+": 13,
|
| 17 |
+
",": 14,
|
| 18 |
+
"-": 15,
|
| 19 |
+
".": 16,
|
| 20 |
+
"/": 17,
|
| 21 |
+
"0": 18,
|
| 22 |
+
"1": 19,
|
| 23 |
+
"2": 20,
|
| 24 |
+
"3": 21,
|
| 25 |
+
"4": 22,
|
| 26 |
+
"5": 23,
|
| 27 |
+
"6": 24,
|
| 28 |
+
"7": 25,
|
| 29 |
+
"8": 26,
|
| 30 |
+
"9": 27,
|
| 31 |
+
":": 28,
|
| 32 |
+
";": 29,
|
| 33 |
+
"?": 30,
|
| 34 |
+
"A": 31,
|
| 35 |
+
"B": 32,
|
| 36 |
+
"C": 33,
|
| 37 |
+
"D": 34,
|
| 38 |
+
"E": 35,
|
| 39 |
+
"F": 36,
|
| 40 |
+
"G": 37,
|
| 41 |
+
"H": 38,
|
| 42 |
+
"I": 39,
|
| 43 |
+
"J": 40,
|
| 44 |
+
"K": 41,
|
| 45 |
+
"L": 42,
|
| 46 |
+
"M": 43,
|
| 47 |
+
"N": 44,
|
| 48 |
+
"O": 45,
|
| 49 |
+
"P": 46,
|
| 50 |
+
"Q": 47,
|
| 51 |
+
"R": 48,
|
| 52 |
+
"S": 49,
|
| 53 |
+
"T": 50,
|
| 54 |
+
"U": 51,
|
| 55 |
+
"V": 52,
|
| 56 |
+
"W": 53,
|
| 57 |
+
"X": 54,
|
| 58 |
+
"Y": 55,
|
| 59 |
+
"Z": 56,
|
| 60 |
+
"a": 57,
|
| 61 |
+
"b": 58,
|
| 62 |
+
"c": 59,
|
| 63 |
+
"d": 60,
|
| 64 |
+
"e": 61,
|
| 65 |
+
"f": 62,
|
| 66 |
+
"g": 63,
|
| 67 |
+
"h": 64,
|
| 68 |
+
"i": 65,
|
| 69 |
+
"j": 66,
|
| 70 |
+
"k": 67,
|
| 71 |
+
"l": 68,
|
| 72 |
+
"m": 69,
|
| 73 |
+
"n": 70,
|
| 74 |
+
"o": 71,
|
| 75 |
+
"p": 72,
|
| 76 |
+
"q": 73,
|
| 77 |
+
"r": 74,
|
| 78 |
+
"s": 75,
|
| 79 |
+
"t": 76,
|
| 80 |
+
"u": 77,
|
| 81 |
+
"v": 78,
|
| 82 |
+
"w": 79,
|
| 83 |
+
"x": 80,
|
| 84 |
+
"y": 81,
|
| 85 |
+
"z": 82
|
| 86 |
+
},
|
| 87 |
+
"max_length": 32,
|
| 88 |
+
"vocab_size": 83
|
| 89 |
+
}
|
data/models/handwriting/config.yaml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
batch_size: 64
|
| 3 |
+
num_workers: 8
|
| 4 |
+
train_lmdb_path: ./iam_lmdbclear
|
| 5 |
+
vocab_path: ./char_vocab.json
|
| 6 |
+
model:
|
| 7 |
+
latent_shape:
|
| 8 |
+
- 4
|
| 9 |
+
- 16
|
| 10 |
+
- 64
|
| 11 |
+
scheduler:
|
| 12 |
+
beta_end: 0.012
|
| 13 |
+
beta_schedule: linear
|
| 14 |
+
beta_start: 0.00085
|
| 15 |
+
num_train_timesteps: 1000
|
| 16 |
+
prediction_type: epsilon
|
| 17 |
+
text_encoder:
|
| 18 |
+
d_ff: 1024
|
| 19 |
+
d_model: 512
|
| 20 |
+
dropout: 0.1
|
| 21 |
+
max_length: 32
|
| 22 |
+
num_heads: 8
|
| 23 |
+
num_layers: 4
|
| 24 |
+
output_dim: 512
|
| 25 |
+
unet:
|
| 26 |
+
act_fn: silu
|
| 27 |
+
attention_head_dim: 8
|
| 28 |
+
block_out_channels:
|
| 29 |
+
- 192
|
| 30 |
+
- 384
|
| 31 |
+
- 768
|
| 32 |
+
- 768
|
| 33 |
+
cross_attention_dim: 512
|
| 34 |
+
down_block_types:
|
| 35 |
+
- DownBlock2D
|
| 36 |
+
- CrossAttnDownBlock2D
|
| 37 |
+
- CrossAttnDownBlock2D
|
| 38 |
+
- DownBlock2D
|
| 39 |
+
in_channels: 4
|
| 40 |
+
layers_per_block: 2
|
| 41 |
+
mid_block_type: UNetMidBlock2DCrossAttn
|
| 42 |
+
norm_num_groups: 32
|
| 43 |
+
num_class_embeds: 657
|
| 44 |
+
out_channels: 4
|
| 45 |
+
sample_size:
|
| 46 |
+
- 16
|
| 47 |
+
- 64
|
| 48 |
+
up_block_types:
|
| 49 |
+
- UpBlock2D
|
| 50 |
+
- CrossAttnUpBlock2D
|
| 51 |
+
- CrossAttnUpBlock2D
|
| 52 |
+
- UpBlock2D
|
| 53 |
+
vae:
|
| 54 |
+
model_name: stabilityai/sd-vae-ft-mse
|
| 55 |
+
training:
|
| 56 |
+
compile_model: false
|
| 57 |
+
ema_decay: 0.999
|
| 58 |
+
ema_inv_gamma: 1.0
|
| 59 |
+
ema_min_decay: 0.0
|
| 60 |
+
ema_power: 1.0
|
| 61 |
+
gradient_accumulation_steps: 1
|
| 62 |
+
log_every_n_steps: 10
|
| 63 |
+
lr_scheduler:
|
| 64 |
+
min_lr: 1.0e-07
|
| 65 |
+
type: cosine
|
| 66 |
+
warmup_steps: 2000
|
| 67 |
+
max_grad_norm: 1.0
|
| 68 |
+
mixed_precision: bf16
|
| 69 |
+
mode: latent
|
| 70 |
+
num_epochs: 300
|
| 71 |
+
num_inference_steps: 1000
|
| 72 |
+
optimizer:
|
| 73 |
+
beta1: 0.9
|
| 74 |
+
beta2: 0.999
|
| 75 |
+
eps: 1.0e-08
|
| 76 |
+
lr: 0.0001
|
| 77 |
+
type: adamw
|
| 78 |
+
weight_decay: 0.01
|
| 79 |
+
output_dir: ./experiments/hf_conditional_latent_batch64
|
| 80 |
+
resume_from_checkpoint: null
|
| 81 |
+
run_name: hf_conditional_latent_batch64
|
| 82 |
+
sample_every_n_steps: 18000
|
| 83 |
+
save_every_n_epochs: 10
|
| 84 |
+
seed: 42
|
| 85 |
+
use_channels_last: false
|
| 86 |
+
use_ema: true
|
| 87 |
+
wandb:
|
| 88 |
+
api_key:
|
| 89 |
+
entity: null
|
| 90 |
+
notes: Hugging Face UNet with EMA and latent diffusion training.
|
| 91 |
+
project: handwriting-diffusion
|
| 92 |
+
tags:
|
| 93 |
+
- hf
|
| 94 |
+
- conditional
|
| 95 |
+
- latent
|
data/models/handwriting/writer_id_map.json
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": 0,
|
| 3 |
+
"1": 1,
|
| 4 |
+
"10": 2,
|
| 5 |
+
"100": 3,
|
| 6 |
+
"102": 4,
|
| 7 |
+
"103": 5,
|
| 8 |
+
"104": 6,
|
| 9 |
+
"105": 7,
|
| 10 |
+
"106": 8,
|
| 11 |
+
"107": 9,
|
| 12 |
+
"108": 10,
|
| 13 |
+
"109": 11,
|
| 14 |
+
"11": 12,
|
| 15 |
+
"110": 13,
|
| 16 |
+
"111": 14,
|
| 17 |
+
"112": 15,
|
| 18 |
+
"113": 16,
|
| 19 |
+
"114": 17,
|
| 20 |
+
"115": 18,
|
| 21 |
+
"116": 19,
|
| 22 |
+
"117": 20,
|
| 23 |
+
"118": 21,
|
| 24 |
+
"119": 22,
|
| 25 |
+
"12": 23,
|
| 26 |
+
"120": 24,
|
| 27 |
+
"121": 25,
|
| 28 |
+
"122": 26,
|
| 29 |
+
"123": 27,
|
| 30 |
+
"124": 28,
|
| 31 |
+
"125": 29,
|
| 32 |
+
"126": 30,
|
| 33 |
+
"127": 31,
|
| 34 |
+
"128": 32,
|
| 35 |
+
"129": 33,
|
| 36 |
+
"13": 34,
|
| 37 |
+
"130": 35,
|
| 38 |
+
"131": 36,
|
| 39 |
+
"132": 37,
|
| 40 |
+
"133": 38,
|
| 41 |
+
"134": 39,
|
| 42 |
+
"135": 40,
|
| 43 |
+
"136": 41,
|
| 44 |
+
"137": 42,
|
| 45 |
+
"138": 43,
|
| 46 |
+
"139": 44,
|
| 47 |
+
"14": 45,
|
| 48 |
+
"140": 46,
|
| 49 |
+
"141": 47,
|
| 50 |
+
"142": 48,
|
| 51 |
+
"143": 49,
|
| 52 |
+
"144": 50,
|
| 53 |
+
"145": 51,
|
| 54 |
+
"146": 52,
|
| 55 |
+
"147": 53,
|
| 56 |
+
"148": 54,
|
| 57 |
+
"149": 55,
|
| 58 |
+
"15": 56,
|
| 59 |
+
"150": 57,
|
| 60 |
+
"151": 58,
|
| 61 |
+
"152": 59,
|
| 62 |
+
"153": 60,
|
| 63 |
+
"154": 61,
|
| 64 |
+
"155": 62,
|
| 65 |
+
"156": 63,
|
| 66 |
+
"157": 64,
|
| 67 |
+
"158": 65,
|
| 68 |
+
"159": 66,
|
| 69 |
+
"16": 67,
|
| 70 |
+
"160": 68,
|
| 71 |
+
"161": 69,
|
| 72 |
+
"162": 70,
|
| 73 |
+
"163": 71,
|
| 74 |
+
"164": 72,
|
| 75 |
+
"165": 73,
|
| 76 |
+
"166": 74,
|
| 77 |
+
"167": 75,
|
| 78 |
+
"168": 76,
|
| 79 |
+
"169": 77,
|
| 80 |
+
"17": 78,
|
| 81 |
+
"170": 79,
|
| 82 |
+
"171": 80,
|
| 83 |
+
"172": 81,
|
| 84 |
+
"173": 82,
|
| 85 |
+
"174": 83,
|
| 86 |
+
"175": 84,
|
| 87 |
+
"176": 85,
|
| 88 |
+
"177": 86,
|
| 89 |
+
"178": 87,
|
| 90 |
+
"179": 88,
|
| 91 |
+
"18": 89,
|
| 92 |
+
"180": 90,
|
| 93 |
+
"181": 91,
|
| 94 |
+
"182": 92,
|
| 95 |
+
"183": 93,
|
| 96 |
+
"184": 94,
|
| 97 |
+
"185": 95,
|
| 98 |
+
"186": 96,
|
| 99 |
+
"187": 97,
|
| 100 |
+
"188": 98,
|
| 101 |
+
"189": 99,
|
| 102 |
+
"19": 100,
|
| 103 |
+
"190": 101,
|
| 104 |
+
"191": 102,
|
| 105 |
+
"192": 103,
|
| 106 |
+
"193": 104,
|
| 107 |
+
"194": 105,
|
| 108 |
+
"195": 106,
|
| 109 |
+
"196": 107,
|
| 110 |
+
"197": 108,
|
| 111 |
+
"198": 109,
|
| 112 |
+
"199": 110,
|
| 113 |
+
"2": 111,
|
| 114 |
+
"20": 112,
|
| 115 |
+
"200": 113,
|
| 116 |
+
"201": 114,
|
| 117 |
+
"202": 115,
|
| 118 |
+
"203": 116,
|
| 119 |
+
"204": 117,
|
| 120 |
+
"205": 118,
|
| 121 |
+
"206": 119,
|
| 122 |
+
"207": 120,
|
| 123 |
+
"208": 121,
|
| 124 |
+
"209": 122,
|
| 125 |
+
"21": 123,
|
| 126 |
+
"210": 124,
|
| 127 |
+
"211": 125,
|
| 128 |
+
"212": 126,
|
| 129 |
+
"213": 127,
|
| 130 |
+
"214": 128,
|
| 131 |
+
"215": 129,
|
| 132 |
+
"216": 130,
|
| 133 |
+
"217": 131,
|
| 134 |
+
"218": 132,
|
| 135 |
+
"219": 133,
|
| 136 |
+
"22": 134,
|
| 137 |
+
"220": 135,
|
| 138 |
+
"221": 136,
|
| 139 |
+
"222": 137,
|
| 140 |
+
"223": 138,
|
| 141 |
+
"224": 139,
|
| 142 |
+
"225": 140,
|
| 143 |
+
"226": 141,
|
| 144 |
+
"227": 142,
|
| 145 |
+
"228": 143,
|
| 146 |
+
"229": 144,
|
| 147 |
+
"23": 145,
|
| 148 |
+
"230": 146,
|
| 149 |
+
"231": 147,
|
| 150 |
+
"232": 148,
|
| 151 |
+
"233": 149,
|
| 152 |
+
"234": 150,
|
| 153 |
+
"235": 151,
|
| 154 |
+
"236": 152,
|
| 155 |
+
"237": 153,
|
| 156 |
+
"238": 154,
|
| 157 |
+
"239": 155,
|
| 158 |
+
"24": 156,
|
| 159 |
+
"240": 157,
|
| 160 |
+
"241": 158,
|
| 161 |
+
"242": 159,
|
| 162 |
+
"243": 160,
|
| 163 |
+
"244": 161,
|
| 164 |
+
"245": 162,
|
| 165 |
+
"246": 163,
|
| 166 |
+
"247": 164,
|
| 167 |
+
"248": 165,
|
| 168 |
+
"249": 166,
|
| 169 |
+
"25": 167,
|
| 170 |
+
"250": 168,
|
| 171 |
+
"251": 169,
|
| 172 |
+
"252": 170,
|
| 173 |
+
"253": 171,
|
| 174 |
+
"254": 172,
|
| 175 |
+
"255": 173,
|
| 176 |
+
"256": 174,
|
| 177 |
+
"257": 175,
|
| 178 |
+
"258": 176,
|
| 179 |
+
"259": 177,
|
| 180 |
+
"26": 178,
|
| 181 |
+
"260": 179,
|
| 182 |
+
"261": 180,
|
| 183 |
+
"262": 181,
|
| 184 |
+
"263": 182,
|
| 185 |
+
"264": 183,
|
| 186 |
+
"265": 184,
|
| 187 |
+
"266": 185,
|
| 188 |
+
"267": 186,
|
| 189 |
+
"268": 187,
|
| 190 |
+
"269": 188,
|
| 191 |
+
"27": 189,
|
| 192 |
+
"270": 190,
|
| 193 |
+
"272": 191,
|
| 194 |
+
"273": 192,
|
| 195 |
+
"274": 193,
|
| 196 |
+
"275": 194,
|
| 197 |
+
"276": 195,
|
| 198 |
+
"277": 196,
|
| 199 |
+
"278": 197,
|
| 200 |
+
"279": 198,
|
| 201 |
+
"28": 199,
|
| 202 |
+
"280": 200,
|
| 203 |
+
"281": 201,
|
| 204 |
+
"282": 202,
|
| 205 |
+
"283": 203,
|
| 206 |
+
"285": 204,
|
| 207 |
+
"286": 205,
|
| 208 |
+
"287": 206,
|
| 209 |
+
"288": 207,
|
| 210 |
+
"289": 208,
|
| 211 |
+
"29": 209,
|
| 212 |
+
"290": 210,
|
| 213 |
+
"291": 211,
|
| 214 |
+
"292": 212,
|
| 215 |
+
"293": 213,
|
| 216 |
+
"294": 214,
|
| 217 |
+
"295": 215,
|
| 218 |
+
"296": 216,
|
| 219 |
+
"297": 217,
|
| 220 |
+
"298": 218,
|
| 221 |
+
"299": 219,
|
| 222 |
+
"3": 220,
|
| 223 |
+
"30": 221,
|
| 224 |
+
"300": 222,
|
| 225 |
+
"301": 223,
|
| 226 |
+
"302": 224,
|
| 227 |
+
"303": 225,
|
| 228 |
+
"304": 226,
|
| 229 |
+
"305": 227,
|
| 230 |
+
"307": 228,
|
| 231 |
+
"308": 229,
|
| 232 |
+
"309": 230,
|
| 233 |
+
"31": 231,
|
| 234 |
+
"310": 232,
|
| 235 |
+
"312": 233,
|
| 236 |
+
"313": 234,
|
| 237 |
+
"314": 235,
|
| 238 |
+
"315": 236,
|
| 239 |
+
"316": 237,
|
| 240 |
+
"317": 238,
|
| 241 |
+
"318": 239,
|
| 242 |
+
"319": 240,
|
| 243 |
+
"32": 241,
|
| 244 |
+
"320": 242,
|
| 245 |
+
"321": 243,
|
| 246 |
+
"322": 244,
|
| 247 |
+
"323": 245,
|
| 248 |
+
"324": 246,
|
| 249 |
+
"325": 247,
|
| 250 |
+
"326": 248,
|
| 251 |
+
"327": 249,
|
| 252 |
+
"328": 250,
|
| 253 |
+
"329": 251,
|
| 254 |
+
"33": 252,
|
| 255 |
+
"330": 253,
|
| 256 |
+
"331": 254,
|
| 257 |
+
"332": 255,
|
| 258 |
+
"333": 256,
|
| 259 |
+
"334": 257,
|
| 260 |
+
"335": 258,
|
| 261 |
+
"336": 259,
|
| 262 |
+
"337": 260,
|
| 263 |
+
"338": 261,
|
| 264 |
+
"339": 262,
|
| 265 |
+
"34": 263,
|
| 266 |
+
"340": 264,
|
| 267 |
+
"341": 265,
|
| 268 |
+
"342": 266,
|
| 269 |
+
"343": 267,
|
| 270 |
+
"344": 268,
|
| 271 |
+
"345": 269,
|
| 272 |
+
"346": 270,
|
| 273 |
+
"347": 271,
|
| 274 |
+
"348": 272,
|
| 275 |
+
"349": 273,
|
| 276 |
+
"35": 274,
|
| 277 |
+
"350": 275,
|
| 278 |
+
"351": 276,
|
| 279 |
+
"352": 277,
|
| 280 |
+
"353": 278,
|
| 281 |
+
"354": 279,
|
| 282 |
+
"355": 280,
|
| 283 |
+
"356": 281,
|
| 284 |
+
"357": 282,
|
| 285 |
+
"359": 283,
|
| 286 |
+
"36": 284,
|
| 287 |
+
"360": 285,
|
| 288 |
+
"361": 286,
|
| 289 |
+
"362": 287,
|
| 290 |
+
"363": 288,
|
| 291 |
+
"364": 289,
|
| 292 |
+
"365": 290,
|
| 293 |
+
"366": 291,
|
| 294 |
+
"367": 292,
|
| 295 |
+
"368": 293,
|
| 296 |
+
"369": 294,
|
| 297 |
+
"37": 295,
|
| 298 |
+
"370": 296,
|
| 299 |
+
"371": 297,
|
| 300 |
+
"372": 298,
|
| 301 |
+
"373": 299,
|
| 302 |
+
"375": 300,
|
| 303 |
+
"376": 301,
|
| 304 |
+
"377": 302,
|
| 305 |
+
"378": 303,
|
| 306 |
+
"379": 304,
|
| 307 |
+
"38": 305,
|
| 308 |
+
"380": 306,
|
| 309 |
+
"382": 307,
|
| 310 |
+
"383": 308,
|
| 311 |
+
"384": 309,
|
| 312 |
+
"385": 310,
|
| 313 |
+
"386": 311,
|
| 314 |
+
"387": 312,
|
| 315 |
+
"388": 313,
|
| 316 |
+
"389": 314,
|
| 317 |
+
"39": 315,
|
| 318 |
+
"390": 316,
|
| 319 |
+
"391": 317,
|
| 320 |
+
"392": 318,
|
| 321 |
+
"393": 319,
|
| 322 |
+
"394": 320,
|
| 323 |
+
"395": 321,
|
| 324 |
+
"396": 322,
|
| 325 |
+
"397": 323,
|
| 326 |
+
"398": 324,
|
| 327 |
+
"399": 325,
|
| 328 |
+
"4": 326,
|
| 329 |
+
"40": 327,
|
| 330 |
+
"400": 328,
|
| 331 |
+
"401": 329,
|
| 332 |
+
"402": 330,
|
| 333 |
+
"403": 331,
|
| 334 |
+
"404": 332,
|
| 335 |
+
"405": 333,
|
| 336 |
+
"406": 334,
|
| 337 |
+
"407": 335,
|
| 338 |
+
"408": 336,
|
| 339 |
+
"409": 337,
|
| 340 |
+
"41": 338,
|
| 341 |
+
"410": 339,
|
| 342 |
+
"411": 340,
|
| 343 |
+
"412": 341,
|
| 344 |
+
"413": 342,
|
| 345 |
+
"414": 343,
|
| 346 |
+
"415": 344,
|
| 347 |
+
"416": 345,
|
| 348 |
+
"417": 346,
|
| 349 |
+
"418": 347,
|
| 350 |
+
"419": 348,
|
| 351 |
+
"42": 349,
|
| 352 |
+
"420": 350,
|
| 353 |
+
"421": 351,
|
| 354 |
+
"422": 352,
|
| 355 |
+
"423": 353,
|
| 356 |
+
"424": 354,
|
| 357 |
+
"425": 355,
|
| 358 |
+
"426": 356,
|
| 359 |
+
"427": 357,
|
| 360 |
+
"428": 358,
|
| 361 |
+
"429": 359,
|
| 362 |
+
"43": 360,
|
| 363 |
+
"430": 361,
|
| 364 |
+
"431": 362,
|
| 365 |
+
"432": 363,
|
| 366 |
+
"433": 364,
|
| 367 |
+
"434": 365,
|
| 368 |
+
"435": 366,
|
| 369 |
+
"436": 367,
|
| 370 |
+
"439": 368,
|
| 371 |
+
"44": 369,
|
| 372 |
+
"440": 370,
|
| 373 |
+
"441": 371,
|
| 374 |
+
"442": 372,
|
| 375 |
+
"443": 373,
|
| 376 |
+
"444": 374,
|
| 377 |
+
"445": 375,
|
| 378 |
+
"446": 376,
|
| 379 |
+
"447": 377,
|
| 380 |
+
"448": 378,
|
| 381 |
+
"449": 379,
|
| 382 |
+
"45": 380,
|
| 383 |
+
"450": 381,
|
| 384 |
+
"451": 382,
|
| 385 |
+
"452": 383,
|
| 386 |
+
"453": 384,
|
| 387 |
+
"454": 385,
|
| 388 |
+
"455": 386,
|
| 389 |
+
"456": 387,
|
| 390 |
+
"457": 388,
|
| 391 |
+
"458": 389,
|
| 392 |
+
"459": 390,
|
| 393 |
+
"46": 391,
|
| 394 |
+
"460": 392,
|
| 395 |
+
"461": 393,
|
| 396 |
+
"462": 394,
|
| 397 |
+
"463": 395,
|
| 398 |
+
"464": 396,
|
| 399 |
+
"465": 397,
|
| 400 |
+
"466": 398,
|
| 401 |
+
"467": 399,
|
| 402 |
+
"468": 400,
|
| 403 |
+
"469": 401,
|
| 404 |
+
"47": 402,
|
| 405 |
+
"470": 403,
|
| 406 |
+
"471": 404,
|
| 407 |
+
"472": 405,
|
| 408 |
+
"473": 406,
|
| 409 |
+
"474": 407,
|
| 410 |
+
"475": 408,
|
| 411 |
+
"476": 409,
|
| 412 |
+
"477": 410,
|
| 413 |
+
"478": 411,
|
| 414 |
+
"479": 412,
|
| 415 |
+
"48": 413,
|
| 416 |
+
"480": 414,
|
| 417 |
+
"481": 415,
|
| 418 |
+
"482": 416,
|
| 419 |
+
"483": 417,
|
| 420 |
+
"484": 418,
|
| 421 |
+
"485": 419,
|
| 422 |
+
"486": 420,
|
| 423 |
+
"487": 421,
|
| 424 |
+
"488": 422,
|
| 425 |
+
"489": 423,
|
| 426 |
+
"49": 424,
|
| 427 |
+
"490": 425,
|
| 428 |
+
"491": 426,
|
| 429 |
+
"492": 427,
|
| 430 |
+
"493": 428,
|
| 431 |
+
"494": 429,
|
| 432 |
+
"495": 430,
|
| 433 |
+
"496": 431,
|
| 434 |
+
"497": 432,
|
| 435 |
+
"498": 433,
|
| 436 |
+
"499": 434,
|
| 437 |
+
"5": 435,
|
| 438 |
+
"50": 436,
|
| 439 |
+
"500": 437,
|
| 440 |
+
"501": 438,
|
| 441 |
+
"502": 439,
|
| 442 |
+
"503": 440,
|
| 443 |
+
"504": 441,
|
| 444 |
+
"505": 442,
|
| 445 |
+
"506": 443,
|
| 446 |
+
"508": 444,
|
| 447 |
+
"509": 445,
|
| 448 |
+
"51": 446,
|
| 449 |
+
"510": 447,
|
| 450 |
+
"511": 448,
|
| 451 |
+
"512": 449,
|
| 452 |
+
"513": 450,
|
| 453 |
+
"514": 451,
|
| 454 |
+
"515": 452,
|
| 455 |
+
"516": 453,
|
| 456 |
+
"517": 454,
|
| 457 |
+
"518": 455,
|
| 458 |
+
"519": 456,
|
| 459 |
+
"52": 457,
|
| 460 |
+
"520": 458,
|
| 461 |
+
"521": 459,
|
| 462 |
+
"522": 460,
|
| 463 |
+
"523": 461,
|
| 464 |
+
"524": 462,
|
| 465 |
+
"525": 463,
|
| 466 |
+
"526": 464,
|
| 467 |
+
"527": 465,
|
| 468 |
+
"528": 466,
|
| 469 |
+
"529": 467,
|
| 470 |
+
"53": 468,
|
| 471 |
+
"530": 469,
|
| 472 |
+
"531": 470,
|
| 473 |
+
"532": 471,
|
| 474 |
+
"533": 472,
|
| 475 |
+
"534": 473,
|
| 476 |
+
"535": 474,
|
| 477 |
+
"536": 475,
|
| 478 |
+
"537": 476,
|
| 479 |
+
"538": 477,
|
| 480 |
+
"539": 478,
|
| 481 |
+
"54": 479,
|
| 482 |
+
"540": 480,
|
| 483 |
+
"541": 481,
|
| 484 |
+
"542": 482,
|
| 485 |
+
"543": 483,
|
| 486 |
+
"544": 484,
|
| 487 |
+
"545": 485,
|
| 488 |
+
"546": 486,
|
| 489 |
+
"547": 487,
|
| 490 |
+
"548": 488,
|
| 491 |
+
"549": 489,
|
| 492 |
+
"55": 490,
|
| 493 |
+
"550": 491,
|
| 494 |
+
"551": 492,
|
| 495 |
+
"552": 493,
|
| 496 |
+
"553": 494,
|
| 497 |
+
"554": 495,
|
| 498 |
+
"555": 496,
|
| 499 |
+
"556": 497,
|
| 500 |
+
"557": 498,
|
| 501 |
+
"558": 499,
|
| 502 |
+
"559": 500,
|
| 503 |
+
"56": 501,
|
| 504 |
+
"560": 502,
|
| 505 |
+
"561": 503,
|
| 506 |
+
"562": 504,
|
| 507 |
+
"563": 505,
|
| 508 |
+
"564": 506,
|
| 509 |
+
"565": 507,
|
| 510 |
+
"566": 508,
|
| 511 |
+
"567": 509,
|
| 512 |
+
"568": 510,
|
| 513 |
+
"569": 511,
|
| 514 |
+
"570": 512,
|
| 515 |
+
"571": 513,
|
| 516 |
+
"572": 514,
|
| 517 |
+
"573": 515,
|
| 518 |
+
"574": 516,
|
| 519 |
+
"575": 517,
|
| 520 |
+
"576": 518,
|
| 521 |
+
"577": 519,
|
| 522 |
+
"578": 520,
|
| 523 |
+
"579": 521,
|
| 524 |
+
"58": 522,
|
| 525 |
+
"580": 523,
|
| 526 |
+
"581": 524,
|
| 527 |
+
"582": 525,
|
| 528 |
+
"583": 526,
|
| 529 |
+
"584": 527,
|
| 530 |
+
"585": 528,
|
| 531 |
+
"586": 529,
|
| 532 |
+
"587": 530,
|
| 533 |
+
"588": 531,
|
| 534 |
+
"589": 532,
|
| 535 |
+
"59": 533,
|
| 536 |
+
"590": 534,
|
| 537 |
+
"591": 535,
|
| 538 |
+
"592": 536,
|
| 539 |
+
"593": 537,
|
| 540 |
+
"594": 538,
|
| 541 |
+
"595": 539,
|
| 542 |
+
"596": 540,
|
| 543 |
+
"597": 541,
|
| 544 |
+
"598": 542,
|
| 545 |
+
"599": 543,
|
| 546 |
+
"6": 544,
|
| 547 |
+
"60": 545,
|
| 548 |
+
"600": 546,
|
| 549 |
+
"601": 547,
|
| 550 |
+
"602": 548,
|
| 551 |
+
"603": 549,
|
| 552 |
+
"604": 550,
|
| 553 |
+
"605": 551,
|
| 554 |
+
"606": 552,
|
| 555 |
+
"607": 553,
|
| 556 |
+
"608": 554,
|
| 557 |
+
"609": 555,
|
| 558 |
+
"61": 556,
|
| 559 |
+
"610": 557,
|
| 560 |
+
"611": 558,
|
| 561 |
+
"612": 559,
|
| 562 |
+
"613": 560,
|
| 563 |
+
"614": 561,
|
| 564 |
+
"615": 562,
|
| 565 |
+
"616": 563,
|
| 566 |
+
"617": 564,
|
| 567 |
+
"618": 565,
|
| 568 |
+
"619": 566,
|
| 569 |
+
"62": 567,
|
| 570 |
+
"620": 568,
|
| 571 |
+
"621": 569,
|
| 572 |
+
"622": 570,
|
| 573 |
+
"623": 571,
|
| 574 |
+
"624": 572,
|
| 575 |
+
"625": 573,
|
| 576 |
+
"626": 574,
|
| 577 |
+
"627": 575,
|
| 578 |
+
"628": 576,
|
| 579 |
+
"629": 577,
|
| 580 |
+
"63": 578,
|
| 581 |
+
"630": 579,
|
| 582 |
+
"631": 580,
|
| 583 |
+
"632": 581,
|
| 584 |
+
"633": 582,
|
| 585 |
+
"634": 583,
|
| 586 |
+
"635": 584,
|
| 587 |
+
"636": 585,
|
| 588 |
+
"637": 586,
|
| 589 |
+
"638": 587,
|
| 590 |
+
"639": 588,
|
| 591 |
+
"64": 589,
|
| 592 |
+
"640": 590,
|
| 593 |
+
"641": 591,
|
| 594 |
+
"642": 592,
|
| 595 |
+
"643": 593,
|
| 596 |
+
"644": 594,
|
| 597 |
+
"645": 595,
|
| 598 |
+
"647": 596,
|
| 599 |
+
"648": 597,
|
| 600 |
+
"649": 598,
|
| 601 |
+
"65": 599,
|
| 602 |
+
"650": 600,
|
| 603 |
+
"651": 601,
|
| 604 |
+
"652": 602,
|
| 605 |
+
"653": 603,
|
| 606 |
+
"654": 604,
|
| 607 |
+
"655": 605,
|
| 608 |
+
"658": 606,
|
| 609 |
+
"659": 607,
|
| 610 |
+
"66": 608,
|
| 611 |
+
"660": 609,
|
| 612 |
+
"661": 610,
|
| 613 |
+
"662": 611,
|
| 614 |
+
"663": 612,
|
| 615 |
+
"664": 613,
|
| 616 |
+
"665": 614,
|
| 617 |
+
"666": 615,
|
| 618 |
+
"667": 616,
|
| 619 |
+
"668": 617,
|
| 620 |
+
"669": 618,
|
| 621 |
+
"67": 619,
|
| 622 |
+
"670": 620,
|
| 623 |
+
"671": 621,
|
| 624 |
+
"68": 622,
|
| 625 |
+
"69": 623,
|
| 626 |
+
"7": 624,
|
| 627 |
+
"70": 625,
|
| 628 |
+
"71": 626,
|
| 629 |
+
"72": 627,
|
| 630 |
+
"73": 628,
|
| 631 |
+
"74": 629,
|
| 632 |
+
"75": 630,
|
| 633 |
+
"76": 631,
|
| 634 |
+
"77": 632,
|
| 635 |
+
"78": 633,
|
| 636 |
+
"79": 634,
|
| 637 |
+
"8": 635,
|
| 638 |
+
"80": 636,
|
| 639 |
+
"81": 637,
|
| 640 |
+
"82": 638,
|
| 641 |
+
"83": 639,
|
| 642 |
+
"84": 640,
|
| 643 |
+
"85": 641,
|
| 644 |
+
"86": 642,
|
| 645 |
+
"87": 643,
|
| 646 |
+
"88": 644,
|
| 647 |
+
"89": 645,
|
| 648 |
+
"9": 646,
|
| 649 |
+
"90": 647,
|
| 650 |
+
"91": 648,
|
| 651 |
+
"92": 649,
|
| 652 |
+
"93": 650,
|
| 653 |
+
"94": 651,
|
| 654 |
+
"95": 652,
|
| 655 |
+
"96": 653,
|
| 656 |
+
"97": 654,
|
| 657 |
+
"98": 655,
|
| 658 |
+
"99": 656
|
| 659 |
+
}
|
data/prompt_templates/Adaptation_GT/seed-based.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI specialized in generating unique HTML
|
| 2 |
+
documents based on multiple scanned images of real-world examples. You have been provided with distinct
|
| 3 |
+
sample images, each from a different cultural or regional
|
| 4 |
+
background. You have been provided seed images of
|
| 5 |
+
{doc type}, each originating from different cultural or regional contexts. For example, some might feature:
|
| 6 |
+
• Local languages or regional disclaimers
|
| 7 |
+
• Different date formats (e.g., dd/mm/yyyy vs. mm/dd/yyyy)
|
| 8 |
+
• Unique currency or numbering formats
|
| 9 |
+
• Varying layout norms (positions of key fields, disclaimers, official stamps, etc.)
|
| 10 |
+
Now, please generate {num solutions} unique HTML
|
| 11 |
+
documents that:
|
| 12 |
+
1. Strictly reflect the overall style, layout, and cultural
|
| 13 |
+
cues found in these samples, but do NOT copy any text,
|
| 14 |
+
disclaimers, or layout verbatim from the samples.
|
| 15 |
+
2. Include any essential mandatory fields: {sections}.
|
| 16 |
+
3. Maintain an A4 size format for printing (using @page
|
| 17 |
+
{{ size: A4; }} or similar CSS).
|
| 18 |
+
4. Maintain a {background requirements}.
|
| 19 |
+
5. Avoid copy-pasting or reusing large chunks of HTML,
|
| 20 |
+
CSS, or disclaimers—each document must be at least
|
| 21 |
+
70% different in code and text than the others.
|
| 22 |
+
6. Strictly wrap each new document in
|
| 23 |
+
<HTML>...</HTML> tags, for example:
|
| 24 |
+
1. <HTML>...Solution #1...</HTML>
|
| 25 |
+
2. <HTML>...Solution #2...</HTML>
|
| 26 |
+
...
|
| 27 |
+
{num solutions}. <HTML>...Solution
|
| 28 |
+
#{num solutions}...</HTML>
|
| 29 |
+
Additional Requirements: {user descriptions}
|
| 30 |
+
Include the {gt type} as JSON in the document via <script type="application/json" id="GT">...</script> in the following format: {gt example}
|
| 31 |
+
Notes:
|
| 32 |
+
• Pay close attention to cultural/regional differences seen
|
| 33 |
+
in the seed images (e.g., language, format, disclaimers).
|
| 34 |
+
• Feel free to creatively adapt or combine stylistic cues
|
| 35 |
+
from the seeds, as long as the end result looks authentic
|
| 36 |
+
for that cultural context.
|
| 37 |
+
• Do NOT directly copy-paste text or entire code blocks
|
| 38 |
+
from any single seed image or across these new solutions.
|
| 39 |
+
Now please generate the {num solutions} distinct
|
| 40 |
+
{doc type} documents.
|
data/prompt_templates/Adaptation_GT/seed-free.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI specialized in generating multiple unique
|
| 2 |
+
HTML documents in one response. Please create
|
| 3 |
+
{num solutions} unique HTML documents representing
|
| 4 |
+
{doc type}.
|
| 5 |
+
Each solution must:
|
| 6 |
+
1. Include all mandatory fields: {sections}.
|
| 7 |
+
2. Be formatted so it could print on A4 (e.g., use @page
|
| 8 |
+
{{ size: A4; }} in your CSS).
|
| 9 |
+
3. Show a significantly different layout, styling, and textual content from every other solution.
|
| 10 |
+
4. Maintain a {background requirements}.
|
| 11 |
+
5. Avoid copy-pasting or reusing large chunks of HTML,
|
| 12 |
+
CSS, or disclaimers—each document must be at least
|
| 13 |
+
70% different in code and text than the others.
|
| 14 |
+
6. Wrap each complete document between <HTML>
|
| 15 |
+
and </HTML> tags, labeled as:
|
| 16 |
+
1. <HTML>...Solution #1...</HTML>
|
| 17 |
+
2. <HTML>...Solution #2...</HTML>
|
| 18 |
+
...
|
| 19 |
+
{num solutions}. <HTML>...Solution
|
| 20 |
+
#{num solutions}...</HTML>
|
| 21 |
+
Include the {gt type} as JSON in the document via <script type="application/json" id="GT">...</script> in the following format: {gt example}
|
| 22 |
+
Do not provide additional commentary or references to the
|
| 23 |
+
other solutions within each HTML.
|
| 24 |
+
Now generate the {num solutions} distinct {doc type}
|
| 25 |
+
documents.
|
data/prompt_templates/ClaudeRefined1/seed-based.txt
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HTML Document Generation Prompt (Refined)
|
| 2 |
+
|
| 3 |
+
You are an AI specialized in creating culturally authentic HTML documents based on visual analysis of real-world examples. You have been provided with {num_seed_images} seed images of **{doc_type}** documents from different cultural and regional contexts.
|
| 4 |
+
|
| 5 |
+
## Cultural Variations (If Present)
|
| 6 |
+
The seed images may demonstrate regional differences such as:
|
| 7 |
+
- Language variations and local terminology
|
| 8 |
+
- Date formatting conventions (DD/MM/YYYY, MM/DD/YYYY, etc.)
|
| 9 |
+
- Currency symbols and number formatting
|
| 10 |
+
- Layout preferences (field positioning, official elements, cultural design patterns)
|
| 11 |
+
- Regional legal disclaimers and regulatory requirements
|
| 12 |
+
- Typography and visual hierarchy standards
|
| 13 |
+
|
| 14 |
+
## Task Requirements
|
| 15 |
+
Generate **{num_solutions}** unique HTML documents that meet these specifications:
|
| 16 |
+
|
| 17 |
+
### Core Requirements
|
| 18 |
+
1. **Cultural Authenticity**: If cultural/regional variations are present in the seed images, reflect those stylistic elements without directly copying any text, disclaimers, or layouts verbatim
|
| 19 |
+
2. **Required Content**: Include all essential fields: {required_sections}
|
| 20 |
+
3. **Single Page Format**: Design as single-page documents with dimensions appropriate to the document type (receipts: narrow format, forms: standard width, etc.)
|
| 21 |
+
4. **Language**: Generate all content in {language}
|
| 22 |
+
5. **Background**: {background_requirements}
|
| 23 |
+
6. **Uniqueness**: Each document must be at least 70% different in code structure, styling, and content from others
|
| 24 |
+
|
| 25 |
+
## Ground Truth Generation
|
| 26 |
+
Generate appropriate ground truth data for each document: {gt_type}.
|
| 27 |
+
Include the ground truth as JSON inside each document in a `<script type="application/json" id="GT">...</script>` tag.
|
| 28 |
+
The ground truth must follow the format: {gt_format}
|
| 29 |
+
|
| 30 |
+
### Technical Specifications
|
| 31 |
+
- Wrap each solution in `<HTML>...</HTML>` tags numbered sequentially
|
| 32 |
+
- Include the ground truth JSON in `<script type="application/json" id="GT">...</script>` as specified above
|
| 33 |
+
- Implement static CSS appropriate for the document type and single-page layout (no animations, transitions, or dynamic effects)
|
| 34 |
+
|
| 35 |
+
## Additional Requirements
|
| 36 |
+
{user_descriptions}
|
| 37 |
+
|
| 38 |
+
### Content Guidelines
|
| 39 |
+
- **DO**: Adapt any cultural/regional stylistic elements present in the seed images
|
| 40 |
+
- **DO**: Create authentic-feeling content appropriate to each cultural context
|
| 41 |
+
- **DO**: Vary layout structures, color schemes, and typographic choices
|
| 42 |
+
- **DO**: Use static styling only (no animations, hover effects, or transitions)
|
| 43 |
+
- **DON'T**: Copy-paste text, code blocks, or entire sections between solutions
|
| 44 |
+
- **DON'T**: Reuse identical disclaimers, headers, or formatting patterns
|
| 45 |
+
- **DON'T**: Include any dynamic effects, animations, or interactive elements
|
| 46 |
+
|
| 47 |
+
## Additional Requirements
|
| 48 |
+
{user_descriptions}
|
| 49 |
+
|
| 50 |
+
## Output Format
|
| 51 |
+
Structure your response as:
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
1. <HTML>
|
| 55 |
+
<!-- Solution #1 with cultural context A -->
|
| 56 |
+
...complete HTML document...
|
| 57 |
+
</HTML>
|
| 58 |
+
|
| 59 |
+
2. <HTML>
|
| 60 |
+
<!-- Solution #2 with cultural context B -->
|
| 61 |
+
...complete HTML document...
|
| 62 |
+
</HTML>
|
| 63 |
+
|
| 64 |
+
...continue for all {num_solutions} solutions
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Quality Checklist
|
| 68 |
+
Before generating, ensure each document:
|
| 69 |
+
- [ ] Reflects any authentic cultural/regional characteristics present in seed images
|
| 70 |
+
- [ ] Contains all required sections: {required_sections}
|
| 71 |
+
- [ ] Uses static styling only (no animations or dynamic effects)
|
| 72 |
+
- [ ] Uses appropriate single-page formatting for the document type
|
| 73 |
+
- [ ] All content is in {language}
|
| 74 |
+
- [ ] Includes the specified ground truth in proper JSON format
|
| 75 |
+
- [ ] Maintains 70%+ uniqueness from other solutions
|
| 76 |
+
- [ ] Follows semantic HTML best practices
|
| 77 |
+
|
| 78 |
+
Now generate the **{num_solutions}** distinct **{doc_type}** documents.
|
data/prompt_templates/ClaudeRefined10/seed-based.txt
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI creating authentic HTML representations of documents based on seed images.
|
| 2 |
+
Analyze the seed images for structural and semantic content and generate authentic variations.
|
| 3 |
+
The generated documents will be printed.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim
|
| 7 |
+
2. **Format**: Single-page documents with dimensions appropriate to the document type
|
| 8 |
+
3. **Language**: {language}
|
| 9 |
+
4. **Static Only**: No animations, transitions, or dynamic effects
|
| 10 |
+
|
| 11 |
+
## Technical
|
| 12 |
+
- Wrap each document in `<HTML>...</HTML>` tags, numbered sequentially
|
| 13 |
+
- Static CSS only for single-page layout
|
| 14 |
+
- Generate only minified CSS, HTML, JS.
|
| 15 |
+
|
| 16 |
+
## Content Guidelines
|
| 17 |
+
**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling
|
| 18 |
+
**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects
|
| 19 |
+
|
| 20 |
+
## Handwritten Fields (if document type requires)
|
| 21 |
+
- Mark with class 'handwritten'
|
| 22 |
+
- Apply generously increased size to 'handwritten', in line with realistic handwriting
|
| 23 |
+
- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people
|
| 24 |
+
- Never include signatures as handwriting
|
| 25 |
+
|
| 26 |
+
## Visual Placeholders (if document type requires)
|
| 27 |
+
- Insert `<div data-placeholder="type" style="...">` for non-text elements at appropriate positions
|
| 28 |
+
- Valid types are: signature, stamp, logo, barcode, photo, chart
|
| 29 |
+
- Add data-content attribute with actual content description
|
| 30 |
+
- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people and ensure the author is semantically coherent with the document content
|
| 31 |
+
- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right'
|
| 32 |
+
- Dimensions in mm/cm, e.g. `width:30mm;height:20mm;`
|
| 33 |
+
- Example: `<div data-placeholder="signature" data-content="john" class="author1" style="width:50mm;height:15mm;"></div>`
|
| 34 |
+
- Example: `<div data-placeholder="stamp" data-content="APPROVED 2024-03-15" style="position:absolute;top:50mm;right:20mm;width:35mm;height:35mm;z-index:10;"></div>`
|
| 35 |
+
- Example: `<div data-placeholder="logo" data-content="ACME Corp Logo" style="width:150mm;height:100mm;"></div>`
|
| 36 |
+
|
| 37 |
+
## Output Format
|
| 38 |
+
Generate minified HTML like this:
|
| 39 |
+
```
|
| 40 |
+
1. <HTML><!DOCTYPE html><html ... document 1 ... </html></HTML>
|
| 41 |
+
2. <HTML><!DOCTYPE html><html ... document 2 ... </html></HTML>
|
| 42 |
+
...
|
| 43 |
+
```
|
| 44 |
+
## Ground Truth
|
| 45 |
+
Generate ground truth as JSON in `<script type="application/json" id="GT">...</script>` tag.
|
| 46 |
+
Ground truth specification: {gt_type}
|
| 47 |
+
Ground truth must follow the format: {gt_format}
|
| 48 |
+
|
| 49 |
+
## Quality Checklist
|
| 50 |
+
- [ ] Authentic variations without verbatim copying from seed images
|
| 51 |
+
- [ ] Static styling only (no animations or dynamic effects)
|
| 52 |
+
- [ ] Single-page format with minified HTML/CSS/JS
|
| 53 |
+
- [ ] Content in {language}
|
| 54 |
+
- [ ] GT JSON present and correctly formatted
|
| 55 |
+
- [ ] Visual elements are semantically coherent
|
| 56 |
+
|
| 57 |
+
Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images.
|
data/prompt_templates/ClaudeRefined11/seed-based.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI creating authentic HTML representations of documents based on seed images.
|
| 2 |
+
Analyze the seed images for structural and semantic content and generate authentic variations.
|
| 3 |
+
The generated documents will be printed.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim
|
| 7 |
+
2. **Format**: Single-page documents with dimensions appropriate to the document type
|
| 8 |
+
3. **Language**: {language}
|
| 9 |
+
4. **Static Only**: No animations, transitions, or dynamic effects
|
| 10 |
+
|
| 11 |
+
## Technical
|
| 12 |
+
- Wrap each document in `<HTML>...</HTML>` tags, numbered sequentially
|
| 13 |
+
- Static CSS only for single-page layout
|
| 14 |
+
- Generate only minified CSS, HTML, JS.
|
| 15 |
+
|
| 16 |
+
## Content Guidelines
|
| 17 |
+
**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling
|
| 18 |
+
**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects
|
| 19 |
+
|
| 20 |
+
## Handwritten Fields (if document type requires)
|
| 21 |
+
- Mark with class 'handwritten' and use regular text
|
| 22 |
+
- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting
|
| 23 |
+
- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people
|
| 24 |
+
- If the handwriting represents a signature, mark it additionally with class 'signature'
|
| 25 |
+
|
| 26 |
+
## Visual Placeholders (if document type requires)
|
| 27 |
+
- Insert `<div data-placeholder="type" style="...">` for non-text elements at appropriate positions
|
| 28 |
+
- Valid types are: stamp, logo, barcode, photo, chart
|
| 29 |
+
- Add data-content attribute with actual content description
|
| 30 |
+
- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right'
|
| 31 |
+
- Always provide dimensions in mm/cm, e.g. `width:30mm;height:20mm;`
|
| 32 |
+
- Example: `<div data-placeholder="stamp" data-content="APPROVED 2024-03-15" style="position:absolute;top:50mm;right:20mm;width:35mm;height:35mm;z-index:10;"></div>`
|
| 33 |
+
- Example: `<div data-placeholder="logo" data-content="ACME Corp Logo" style="width:150mm;height:100mm;"></div>`
|
| 34 |
+
|
| 35 |
+
## Output Format
|
| 36 |
+
Generate minified HTML like this:
|
| 37 |
+
```
|
| 38 |
+
1. <HTML><!DOCTYPE html><html ... document 1 ... </html></HTML>
|
| 39 |
+
2. <HTML><!DOCTYPE html><html ... document 2 ... </html></HTML>
|
| 40 |
+
...
|
| 41 |
+
```
|
| 42 |
+
## Ground Truth
|
| 43 |
+
Generate ground truth as JSON in `<script type="application/json" id="GT">...</script>` tag.
|
| 44 |
+
Ground truth specification: {gt_type}
|
| 45 |
+
Ground truth must follow the format: {gt_format}
|
| 46 |
+
|
| 47 |
+
## Quality Checklist
|
| 48 |
+
- [ ] Authentic variations without verbatim copying from seed images
|
| 49 |
+
- [ ] Static styling only (no animations or dynamic effects)
|
| 50 |
+
- [ ] Single-page format with minified HTML/CSS/JS
|
| 51 |
+
- [ ] Content in {language}
|
| 52 |
+
- [ ] GT JSON present and correctly formatted
|
| 53 |
+
- [ ] Visual elements are semantically coherent
|
| 54 |
+
|
| 55 |
+
Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images.
|
data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI creating authentic HTML representations of documents based on seed images.
|
| 2 |
+
Analyze the seed images for structural and semantic content and generate authentic variations.
|
| 3 |
+
The generated documents will be printed.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim
|
| 7 |
+
2. **Format**: Single-page documents with dimensions appropriate to the document type
|
| 8 |
+
3. **Language**: {language}
|
| 9 |
+
4. **Static Only**: No animations, transitions, or dynamic effects
|
| 10 |
+
|
| 11 |
+
## Technical
|
| 12 |
+
- Wrap each document in `<HTML>...</HTML>` tags, numbered sequentially
|
| 13 |
+
- Static CSS only for single-page layout
|
| 14 |
+
- Generate only minified CSS, HTML, JS.
|
| 15 |
+
|
| 16 |
+
## Content Guidelines
|
| 17 |
+
**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling
|
| 18 |
+
**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects
|
| 19 |
+
|
| 20 |
+
## Handwritten Fields (if document type requires)
|
| 21 |
+
- Mark with class 'handwritten' and use regular text
|
| 22 |
+
- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting
|
| 23 |
+
- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people
|
| 24 |
+
- If the handwriting represents a signature, mark it additionally with class 'signature'
|
| 25 |
+
|
| 26 |
+
## Visual Placeholders (if document type requires)
|
| 27 |
+
- Insert `<div data-placeholder="type" style="...">` for non-text elements at appropriate positions
|
| 28 |
+
- Valid types are: stamp, logo, figure, barcode, photo
|
| 29 |
+
- Add data-content attribute with actual content description
|
| 30 |
+
- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right'
|
| 31 |
+
- Always provide appropriate dimensions
|
| 32 |
+
- Example: `<div data-placeholder="stamp" data-content="APPROVED 2024-03-15" style="position:absolute;top:50mm;right:20mm;width:35mm;height:35mm;z-index:10;"></div>`
|
| 33 |
+
- Example: `<div data-placeholder="logo" data-content="ACME Corp Logo" style="width:150mm;height:100mm;"></div>`
|
| 34 |
+
|
| 35 |
+
## Output Format
|
| 36 |
+
Generate minified HTML like this:
|
| 37 |
+
```
|
| 38 |
+
1. <HTML><!DOCTYPE html><html ... document 1 ... </html></HTML>
|
| 39 |
+
2. <HTML><!DOCTYPE html><html ... document 2 ... </html></HTML>
|
| 40 |
+
...
|
| 41 |
+
```
|
| 42 |
+
## Ground Truth
|
| 43 |
+
Generate ground truth by assigning each applicable element in HTML a class from the list below to uniquely identify its label:
|
| 44 |
+
{gt_type}
|
| 45 |
+
{gt_format}
|
| 46 |
+
|
| 47 |
+
## Quality Checklist
|
| 48 |
+
- [ ] Authentic variations without verbatim copying from seed images
|
| 49 |
+
- [ ] Static styling only (no animations or dynamic effects)
|
| 50 |
+
- [ ] Single-page format with minified HTML/CSS
|
| 51 |
+
- [ ] Content in {language}
|
| 52 |
+
- [ ] GT labels via class annotations are present and assigned to correct elements
|
| 53 |
+
- [ ] Visual elements are semantically coherent
|
| 54 |
+
|
| 55 |
+
Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images.
|
data/prompt_templates/ClaudeRefined12/seed-based-json.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI creating authentic HTML representations of documents based on seed images.
|
| 2 |
+
Analyze the seed images for structural and semantic content and generate authentic variations.
|
| 3 |
+
The generated documents will be printed.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim
|
| 7 |
+
2. **Format**: Single-page documents with dimensions appropriate to the document type
|
| 8 |
+
3. **Language**: {language}
|
| 9 |
+
4. **Static Only**: No animations, transitions, or dynamic effects
|
| 10 |
+
|
| 11 |
+
## Technical
|
| 12 |
+
- Wrap each document in `<HTML>...</HTML>` tags, numbered sequentially
|
| 13 |
+
- Static CSS only for single-page layout
|
| 14 |
+
- Generate only minified CSS, HTML, JS.
|
| 15 |
+
|
| 16 |
+
## Content Guidelines
|
| 17 |
+
**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling
|
| 18 |
+
**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects
|
| 19 |
+
|
| 20 |
+
## Handwritten Fields (if document type requires)
|
| 21 |
+
- Mark with class 'handwritten' and use regular text
|
| 22 |
+
- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting
|
| 23 |
+
- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people
|
| 24 |
+
- If the handwriting represents a signature, mark it additionally with class 'signature'
|
| 25 |
+
|
| 26 |
+
## Visual Placeholders (if document type requires)
|
| 27 |
+
- Insert `<div data-placeholder="type" style="...">` for non-text elements at appropriate positions
|
| 28 |
+
- Valid types are: stamp, logo, figure, barcode, photo
|
| 29 |
+
- Add data-content attribute with actual content description
|
| 30 |
+
- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right'
|
| 31 |
+
- Always provide appropriate dimensions
|
| 32 |
+
- Example: `<div data-placeholder="stamp" data-content="APPROVED 2024-03-15" style="position:absolute;top:50mm;right:20mm;width:35mm;height:35mm;z-index:10;"></div>`
|
| 33 |
+
- Example: `<div data-placeholder="logo" data-content="ACME Corp Logo" style="width:150mm;height:100mm;"></div>`
|
| 34 |
+
|
| 35 |
+
## Output Format
|
| 36 |
+
Generate minified HTML like this:
|
| 37 |
+
```
|
| 38 |
+
1. <HTML><!DOCTYPE html><html ... document 1 ... </html></HTML>
|
| 39 |
+
2. <HTML><!DOCTYPE html><html ... document 2 ... </html></HTML>
|
| 40 |
+
...
|
| 41 |
+
```
|
| 42 |
+
## Ground Truth
|
| 43 |
+
Generate ground truth as JSON in `<script type="application/json" id="GT">...</script>` tag.
|
| 44 |
+
Ground truth specification: {gt_type}
|
| 45 |
+
Ground truth must follow the format: {gt_format}
|
| 46 |
+
|
| 47 |
+
## Quality Checklist
|
| 48 |
+
- [ ] Authentic variations without verbatim copying from seed images
|
| 49 |
+
- [ ] Static styling only (no animations or dynamic effects)
|
| 50 |
+
- [ ] Single-page format with minified HTML/CSS
|
| 51 |
+
- [ ] Content in {language}
|
| 52 |
+
- [ ] GT JSON present, correctly formatted and semantically coherent
|
| 53 |
+
- [ ] Visual elements are semantically coherent
|
| 54 |
+
|
| 55 |
+
Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images.
|