Spaces:
Sleeping
Sleeping
| """Configuration, environment variables, and logging setup for the parser.""" | |
| import logging | |
| import os | |
# Configure the root logger once at import time so every module that imports
# this configuration shares the same timestamped, pipe-separated line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Named logger for this service.
logger = logging.getLogger("docling-parser")
# Token read from the environment; None when API_TOKEN is not set.
API_TOKEN = os.getenv("API_TOKEN")

# os.cpu_count() may return None when the count cannot be determined;
# fall back to 4 so the derived defaults below stay well-defined.
CPU_COUNT = os.cpu_count() or 4
def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an integer.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default`` when nothing usable is set.

    Raises:
        ValueError: If the variable is set to text ``int()`` cannot parse.
            The message names the offending variable.
    """
    raw = os.getenv(name)
    # A blank value (e.g. ``FOO=`` in a deployment env file) is treated as
    # unset instead of crashing the import with an anonymous ValueError.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err
def _env_float(name: str, default: float) -> float:
    """Read environment variable *name* as a float.

    Args:
        name: Environment variable to look up.
        default: Value returned (as a float) when the variable is unset or
            blank.

    Returns:
        The parsed float, or ``float(default)`` when nothing usable is set.

    Raises:
        ValueError: If the variable is set to text ``float()`` cannot parse.
            The message names the offending variable.
    """
    raw = os.getenv(name)
    # A blank value (e.g. ``FOO=`` in a deployment env file) is treated as
    # unset instead of crashing the import with an anonymous ValueError.
    if raw is None or not raw.strip():
        return float(default)
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw!r}"
        ) from err
# Parsing and rendering settings. Each one is read from the environment
# variable of the same name; the literal passed alongside is the default.
IMAGES_SCALE = _env_float("IMAGES_SCALE", 2.0)
MAX_FILE_SIZE_MB = _env_int("MAX_FILE_SIZE_MB", 1024)
# Byte equivalent of MAX_FILE_SIZE_MB, derived once at import time.
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
RENDER_DPI = _env_int("RENDER_DPI", 200)
DOCLING_DEVICE = os.getenv("DOCLING_DEVICE", "auto")
# Defaults to the CPU count, capped at 4 threads.
DOCLING_NUM_THREADS = _env_int("DOCLING_NUM_THREADS", min(4, CPU_COUNT))
BITMAP_AREA_THRESHOLD = _env_float("BITMAP_AREA_THRESHOLD", 0.05)
SPARSE_TEXT_THRESHOLD = _env_int("SPARSE_TEXT_THRESHOLD", 80)
IMAGE_DOMINANT_THRESHOLD = _env_float("IMAGE_DOMINANT_THRESHOLD", 0.75)
# Gemini settings, each overridable through the environment variable of the
# same name.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")  # empty string when unset
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
# NOTE(review): unit is not visible in this file — presumably seconds; confirm.
GEMINI_TIMEOUT = _env_float("GEMINI_TIMEOUT", 120.0)
# Default is half the CPU count, clamped to the range 2..4.
GEMINI_CONCURRENCY = _env_int(
    "GEMINI_CONCURRENCY",
    max(2, min(4, CPU_COUNT // 2)),
)
# Figure transcription (Pass 2.5)
# Disabled by default — enabling changes markdown output, latency, and Gemini
# token spend. Flip `TRANSCRIBE_IMAGES=true` in HF env after spot-checking, or
# override per request via the `transcribe_images` form/JSON field.
# Accepted truthy spellings (case-insensitive): "1", "true", "yes".
TRANSCRIBE_IMAGES = os.getenv("TRANSCRIBE_IMAGES", "false").lower() in {
    "1",
    "true",
    "yes",
}
IMAGE_TRANSCRIPTION_MIN_PX = _env_int("IMAGE_TRANSCRIPTION_MIN_PX", 150)
IMAGE_TRANSCRIPTION_MIN_AREA_RATIO = _env_float(
    "IMAGE_TRANSCRIPTION_MIN_AREA_RATIO", 0.02
)
# Default is a quarter of the CPU count, clamped to the range 1..2.
GEMINI_IMAGE_CONCURRENCY = _env_int(
    "GEMINI_IMAGE_CONCURRENCY",
    max(1, min(2, CPU_COUNT // 4)),
)

# Hard per-request ceiling on figure transcription calls. A large PDF with
# many qualifying pictures would otherwise turn into hundreds of Gemini
# calls. When exceeded, the largest (by bbox area) are kept and the rest
# dropped with a WARNING log. Set to 0 to disable the cap.
MAX_IMAGE_TRANSCRIPTIONS = _env_int("MAX_IMAGE_TRANSCRIPTIONS", 50)
# Concurrency tuning
# THREAD_POOL_SIZE: default executor for blocking filesystem and pipeline
#   work. Keep this conservative so queued requests wait instead of turning
#   into CPU contention.
# EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM
#   when many large workbooks arrive at once (openpyxl loads full file into RAM).
# RENDER_CONCURRENCY: its consumer is not visible in this file — presumably a
#   cap on simultaneous render jobs; confirm against the caller.
# Default is the CPU count, clamped to the range 8..12.
THREAD_POOL_SIZE = _env_int("THREAD_POOL_SIZE", max(8, min(12, CPU_COUNT)))
# Default is half the CPU count, clamped to the range 2..6.
EXCEL_CONCURRENCY = _env_int(
    "EXCEL_CONCURRENCY",
    max(2, min(6, CPU_COUNT // 2)),
)
# Default is half the CPU count, clamped to the range 2..4.
RENDER_CONCURRENCY = _env_int(
    "RENDER_CONCURRENCY",
    max(2, min(4, CPU_COUNT // 2)),
)
# PDF load shaping: queued admission control for the heaviest stage
# (Docling conversion). Capacity is in abstract "work units" so a process can
# run multiple small PDFs concurrently while larger PDFs naturally wait.
# Default is a quarter of the CPU count, clamped to the range 1..2.
PDF_PARSE_CAPACITY_UNITS = _env_int(
    "PDF_PARSE_CAPACITY_UNITS",
    max(1, min(2, CPU_COUNT // 4)),
)
PDF_PAGES_PER_QUEUE_UNIT = _env_int("PDF_PAGES_PER_QUEUE_UNIT", 25)
# NOTE(review): unit is not visible in this file — presumably seconds; confirm.
PDF_PARSE_QUEUE_TIMEOUT = _env_float("PDF_PARSE_QUEUE_TIMEOUT", 600.0)
# Hostnames and literal addresses that must be refused: the loopback name plus
# cloud instance-metadata endpoints (including the link-local IPv4 address and
# an IPv6 metadata address).
# NOTE(review): the consumer is not visible in this file — presumably URL
# validation for remote fetches; confirm against the caller.
BLOCKED_HOSTNAMES = {
    "localhost",
    "metadata",
    "metadata.google.internal",
    "metadata.google",
    "169.254.169.254",
    "fd00:ec2::254",
}