tommulder's picture
style: format Python files with Black
5537ceb
"""HF Dots.OCR Text Extraction Endpoint
This FastAPI application provides a Hugging Face Space endpoint for Dots.OCR
text extraction with ROI support and standardized field extraction schema.
"""
import logging
import os
import time
import uuid
import json
import re
from typing import List, Optional, Dict, Any
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
# Import local modules
from .api_models import (
BoundingBox,
ExtractedField,
ExtractedFields,
MRZData,
OCRDetection,
OCRResponse,
)
from .enhanced_field_extraction import EnhancedFieldExtractor
from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
from .preprocessing import process_document, validate_file_size, get_document_info
from .response_builder import build_ocr_response, build_error_response
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Enable verbose logging globally if DOTS_OCR_DEBUG env var is set.
_env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
if _env_debug:
# Elevate root logger to DEBUG to include lower-level events from submodules
logging.getLogger().setLevel(logging.DEBUG)
logger.info("DOTS_OCR_DEBUG enabled via environment — verbose logging active")
# Global model state
model_loaded = False
# FieldExtractor is now imported from the shared module
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan manager for model loading."""
global model_loaded
# Allow tests and lightweight environments to skip model loading
# Set DOTS_OCR_SKIP_MODEL_LOAD=1 to bypass heavy downloads during tests/CI
skip_model_load = os.getenv("DOTS_OCR_SKIP_MODEL_LOAD", "0") == "1"
logger.info("Loading Dots.OCR model...")
try:
if skip_model_load:
# Explicitly skip model loading for fast startup in tests/CI
model_loaded = False
logger.warning(
"DOTS_OCR_SKIP_MODEL_LOAD=1 set - skipping model load (mock mode)"
)
else:
# Load the model using the new model loader
load_model()
model_loaded = True
logger.info("Dots.OCR model loaded successfully")
# Log model information
model_info = get_model_info()
logger.info(f"Model info: {model_info}")
except Exception as e:
logger.error(f"Failed to load Dots.OCR model: {e}")
# Don't raise - allow mock mode for development
model_loaded = False
logger.warning("Model loading failed - using mock implementation")
yield
logger.info("Shutting down Dots.OCR endpoint...")
app = FastAPI(
title="KYB Dots.OCR Text Extraction",
description="Dots.OCR for identity document text extraction with ROI support",
version="1.0.0",
lifespan=lifespan,
)
@app.get("/")
async def root():
"""Root route for uptime checks."""
return {"status": "ok"}
@app.get("/health")
async def health_check():
"""Health check endpoint."""
global model_loaded
status = "healthy" if model_loaded else "degraded"
model_info = get_model_info() if model_loaded else None
return {
"status": status,
"version": "1.0.0",
"model_loaded": model_loaded,
"model_info": model_info,
}
@app.post("/v1/id/ocr", response_model=OCRResponse)
async def extract_text_endpoint(
file: UploadFile = File(..., description="Image or PDF file to process"),
roi: Optional[str] = Form(None, description="ROI coordinates as JSON string"),
debug: Optional[bool] = Form(
None,
description=(
"Enable verbose debug logging for this request. Overrides env when True."
),
),
):
"""Extract text from identity document image or PDF."""
global model_loaded
# Allow mock mode when model isn't loaded to support tests/CI and dev flows
allow_mock = os.getenv("DOTS_OCR_ALLOW_MOCK", "1") == "1"
is_mock_mode = (not model_loaded) and allow_mock
if not model_loaded and not allow_mock:
raise HTTPException(status_code=503, detail="Model not loaded")
# Determine effective debug mode for this request
env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
debug_enabled = bool(debug) if debug is not None else env_debug
if debug_enabled:
logger.info(
f"[debug] Request {request_id}: debug logging enabled (env={env_debug}, form={debug})"
)
if is_mock_mode:
logger.warning(
"Using mock mode — OCR text will be empty. To enable real inference, ensure the model loads successfully (unset DOTS_OCR_SKIP_MODEL_LOAD and provide resources)."
)
start_time = time.time()
request_id = str(uuid.uuid4())
try:
# Read file data
file_data = await file.read()
# Validate file size
if not validate_file_size(file_data):
raise HTTPException(status_code=413, detail="File size exceeds limit")
# Get document information
doc_info = get_document_info(file_data)
logger.info(f"Processing document: {doc_info}")
# Parse ROI if provided
roi_coords = None
if roi:
try:
roi_data = json.loads(roi)
roi_bbox = BoundingBox(**roi_data)
roi_coords = (roi_bbox.x1, roi_bbox.y1, roi_bbox.x2, roi_bbox.y2)
logger.info(f"Using ROI: {roi_coords}")
except Exception as e:
logger.warning(f"Invalid ROI provided: {e}")
raise HTTPException(status_code=400, detail=f"Invalid ROI format: {e}")
# Process document (PDF to images or single image)
try:
processed_images = process_document(file_data, roi_coords)
logger.info(f"Processed {len(processed_images)} images from document")
except Exception as e:
logger.error(f"Document processing failed: {e}")
raise HTTPException(
status_code=400, detail=f"Document processing failed: {e}"
)
# Process each image and extract text
ocr_texts = []
page_metadata = []
for i, image in enumerate(processed_images):
try:
# Extract text using the loaded model, or produce mock output in mock mode
if is_mock_mode:
# In mock mode, we skip model inference and return empty text
ocr_text = ""
else:
ocr_text = extract_text(image)
logger.info(
f"Page {i + 1} - Extracted text length: {len(ocr_text)} characters"
)
ocr_texts.append(ocr_text)
# Collect page metadata
page_meta = {
"page_index": i,
"image_size": image.size,
"text_length": len(ocr_text),
"processing_successful": True,
}
page_metadata.append(page_meta)
except Exception as e:
logger.error(f"Text extraction failed for page {i + 1}: {e}")
# Add empty text for failed page
ocr_texts.append("")
page_meta = {
"page_index": i,
"image_size": image.size if hasattr(image, "size") else (0, 0),
"text_length": 0,
"processing_successful": False,
"error": str(e),
}
page_metadata.append(page_meta)
# Determine media type for response
media_type = "pdf" if doc_info["is_pdf"] else "image"
processing_time = time.time() - start_time
# Build response using the response builder
return build_ocr_response(
request_id=request_id,
media_type=media_type,
processing_time=processing_time,
ocr_texts=ocr_texts,
page_metadata=page_metadata,
debug=debug_enabled,
)
except HTTPException:
# Re-raise HTTP exceptions as-is
raise
except Exception as e:
logger.error(f"OCR extraction failed: {e}")
processing_time = time.time() - start_time
error_response = build_error_response(
request_id=request_id,
error_message=f"OCR extraction failed: {str(e)}",
processing_time=processing_time,
)
raise HTTPException(status_code=500, detail=error_response.dict())
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)