docling-parser / config.py
Ibad ur Rehman
feat: add parser load shaping and shared executors
3c878c2
"""Configuration, environment variables, and logging setup for the parser."""
import logging
import os
# Configure root logging at import time: INFO level, pipe-separated
# "timestamp | level | message" records with a second-resolution timestamp.
logging.basicConfig(
    format="%(asctime)s | %(levelname)-8s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger registered under the name "docling-parser".
logger = logging.getLogger("docling-parser")
# Value of the API_TOKEN environment variable; None when it is not set.
API_TOKEN = os.environ.get("API_TOKEN")

# os.cpu_count() can return None when the count is undetermined; assume 4 then.
CPU_COUNT = os.cpu_count() or 4
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to look up.
        default: Value used when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default`` when nothing usable is set.

    Raises:
        ValueError: If the variable holds text that ``int()`` cannot parse.
            The message names the variable and shows the offending value.
    """
    raw = os.getenv(name)
    # A declared-but-blank variable (``NAME=``) is treated as "not set"
    # instead of failing at import time with an anonymous int('') error.
    if raw is None or not raw.strip():
        return int(default)
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from exc
def _env_float(name: str, default: float) -> float:
    """Read a floating-point setting from the environment.

    Args:
        name: Environment variable to look up.
        default: Value used when the variable is unset or blank.

    Returns:
        The parsed float, or ``default`` (as a float) when nothing usable
        is set.

    Raises:
        ValueError: If the variable holds text that ``float()`` cannot
            parse. The message names the variable and shows the offending
            value.
    """
    raw = os.getenv(name)
    # A declared-but-blank variable (``NAME=``) is treated as "not set"
    # instead of failing at import time with an anonymous float('') error.
    if raw is None or not raw.strip():
        return float(default)
    try:
        return float(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw!r}"
        ) from exc
# Image scale factor (float), default 2.0.
IMAGES_SCALE = float(os.environ.get("IMAGES_SCALE", "2.0"))

# Upload size limit: configured in megabytes, with the byte count derived
# from it (1 MB = 1024 * 1024 bytes here).
MAX_FILE_SIZE_MB = int(os.environ.get("MAX_FILE_SIZE_MB", "1024"))
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024**2

# Rendering resolution in DPI, default 200.
RENDER_DPI = int(os.environ.get("RENDER_DPI", "200"))

# Device selector string, default "auto".
DOCLING_DEVICE = os.environ.get("DOCLING_DEVICE", "auto")
# Thread count: defaults to the detected core count, capped at 4.
DOCLING_NUM_THREADS = _env_int(
    "DOCLING_NUM_THREADS",
    default=min(CPU_COUNT, 4),
)
# Page-classification thresholds, each overridable through the environment
# variable of the same name.
BITMAP_AREA_THRESHOLD = float(os.environ.get("BITMAP_AREA_THRESHOLD", "0.05"))
SPARSE_TEXT_THRESHOLD = int(os.environ.get("SPARSE_TEXT_THRESHOLD", "80"))
IMAGE_DOMINANT_THRESHOLD = float(
    os.environ.get("IMAGE_DOMINANT_THRESHOLD", "0.75")
)
# Gemini client settings. The API key defaults to an empty string when unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3-flash-preview")
# Request timeout as a float (presumably seconds; confirm against the caller).
GEMINI_TIMEOUT = float(os.environ.get("GEMINI_TIMEOUT", "120"))
# Default: half the detected cores, clamped to the range 2..4.
GEMINI_CONCURRENCY = _env_int(
    "GEMINI_CONCURRENCY",
    default=max(2, min(4, CPU_COUNT // 2)),
)
# Figure transcription (Pass 2.5)
# Disabled by default — enabling changes markdown output, latency, and Gemini
# token spend. Flip `TRANSCRIBE_IMAGES=true` in HF env after spot-checking, or
# override per request via the `transcribe_images` form/JSON field.
# Opt-in switch for figure transcription. "1", "true" and "yes" enable it,
# compared case-insensitively. Surrounding whitespace is ignored so a value
# carrying a stray trailing space or newline is not silently read as "off".
# Any other value, or an unset variable, leaves the feature disabled.
TRANSCRIBE_IMAGES = os.getenv("TRANSCRIBE_IMAGES", "false").strip().lower() in {
    "1",
    "true",
    "yes",
}
# Size filters for pictures considered for transcription: a minimum pixel
# dimension and a minimum area ratio.
IMAGE_TRANSCRIPTION_MIN_PX = _env_int("IMAGE_TRANSCRIPTION_MIN_PX", default=150)
IMAGE_TRANSCRIPTION_MIN_AREA_RATIO = _env_float(
    "IMAGE_TRANSCRIPTION_MIN_AREA_RATIO",
    default=0.02,
)

# Default: a quarter of the detected cores, clamped to the range 1..2.
GEMINI_IMAGE_CONCURRENCY = _env_int(
    "GEMINI_IMAGE_CONCURRENCY",
    default=min(2, max(1, CPU_COUNT // 4)),
)
# Per-request upper bound on figure transcription calls, so that a large PDF
# with many qualifying pictures cannot fan out into hundreds of Gemini calls.
# Over the limit, only the largest pictures (by bbox area) are kept; the rest
# are dropped and a WARNING is logged. A value of 0 turns the cap off.
MAX_IMAGE_TRANSCRIPTIONS = _env_int(
    name="MAX_IMAGE_TRANSCRIPTIONS",
    default=50,
)
# Concurrency tuning
# THREAD_POOL_SIZE: default executor for blocking filesystem and pipeline
# work. Keep this conservative so queued requests wait instead of turning
# into CPU contention.
# EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM
# when many large workbooks arrive at once (openpyxl loads full file into RAM).
# Default: the detected core count, clamped to the range 8..12.
THREAD_POOL_SIZE = _env_int(
    "THREAD_POOL_SIZE",
    default=min(12, max(8, CPU_COUNT)),
)

# Default: half the detected cores, clamped to the range 2..6.
EXCEL_CONCURRENCY = _env_int(
    "EXCEL_CONCURRENCY",
    default=max(2, min(6, CPU_COUNT // 2)),
)

# Default: half the detected cores, clamped to the range 2..4.
RENDER_CONCURRENCY = _env_int(
    "RENDER_CONCURRENCY",
    default=max(2, min(4, CPU_COUNT // 2)),
)
# PDF load shaping: queued admission control for the heaviest stage
# (Docling conversion). Capacity is in abstract "work units" so a process can
# run multiple small PDFs concurrently while larger PDFs naturally wait.
# Total work units available for PDF parsing. Default: a quarter of the
# detected cores, clamped to the range 1..2.
PDF_PARSE_CAPACITY_UNITS = _env_int(
    "PDF_PARSE_CAPACITY_UNITS",
    default=min(2, max(1, CPU_COUNT // 4)),
)

# Pages-per-unit conversion factor, default 25.
PDF_PAGES_PER_QUEUE_UNIT = _env_int("PDF_PAGES_PER_QUEUE_UNIT", default=25)

# Queue wait limit as a float (presumably seconds; confirm against the caller).
PDF_PARSE_QUEUE_TIMEOUT = _env_float("PDF_PARSE_QUEUE_TIMEOUT", default=600.0)
# Host names and literal addresses that are denied.
# NOTE(review): the entries look like loopback plus cloud instance-metadata
# endpoints; confirm the list against the code that consumes it.
BLOCKED_HOSTNAMES = {
    # Local machine.
    "localhost",
    # Metadata service host names.
    "metadata",
    "metadata.google",
    "metadata.google.internal",
    # Metadata service literal addresses (IPv4 and IPv6).
    "169.254.169.254",
    "fd00:ec2::254",
}