Spaces:

make789
/

OCRdeepSeekService

Sleeping

App Files Files Community

make789 committed on Oct 29

Commit

20d4651

verified ·

1 Parent(s): 855f6ac

Upload ocr_service.py

Browse files

Files changed (1) hide show

ocr_service.py +393 -7

ocr_service.py CHANGED Viewed

@@ -11,8 +11,9 @@ from time import monotonic
 from typing import Any, Deque, DefaultDict, Optional
 import numpy as np
-from fastapi import Depends, FastAPI, Form, HTTPException, Request, UploadFile, status
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import APIKeyHeader
 from PIL import Image
@@ -127,6 +128,11 @@ _ocr_model = None
 _ocr_tokenizer = None
 _model_lock = asyncio.Lock()
 def _download_and_patch_model_locally(model_id: str, revision: str) -> str:
     """
@@ -274,16 +280,37 @@ async def get_ocr_model():
 async def run_deepseek_ocr(
     image_path: str,
     prompt: str = "<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
-    use_grounding: bool = True
 ) -> dict:
     """
     Run DeepSeek-OCR on an image file with advanced grounding support.
     """
     model, tokenizer = await get_ocr_model()
     output_path = tempfile.mkdtemp()
     try:
         # OCR quality settings - Gundam preset recommended for CPU/Spaces
         torch = _get_torch()
         if USE_GPU and torch.cuda.is_available():
@@ -296,9 +323,23 @@ async def run_deepseek_ocr(
             actual_image_size = 640
             print(f"  - Using CPU-optimized quality: base_size={actual_base_size}, image_size={actual_image_size}")
         # Use torch.inference_mode() to reduce overhead on CPU
         torch = _get_torch()
         with torch.inference_mode():
             result = model.infer(
                 tokenizer,
                 prompt=prompt,
@@ -311,6 +352,17 @@ async def run_deepseek_ocr(
                 test_compress=False,
             )
         # Parse result - DeepSeek-OCR returns structured markdown output
         raw_text = result if isinstance(result, str) else str(result)
@@ -318,12 +370,69 @@ async def run_deepseek_ocr(
         # This parses grounding annotations to get bounding boxes
         lines = _parse_deepseek_output(raw_text)
         # Convert to clean markdown (remove tags, keep text)
         clean_markdown = _deepseek_to_markdown(raw_text)
         return {
             "text": clean_markdown,  # Return clean markdown without tags
             "lines": lines,  # Structured lines with bounding boxes
         }
     except Exception as e:
         print(f"DeepSeek-OCR error: {e}")
@@ -343,6 +452,38 @@ async def run_deepseek_ocr(
             pass
 def _deepseek_to_markdown(s: str) -> str:
     """
     Convert DeepSeek-OCR tagged output to clean Markdown.
@@ -750,10 +891,45 @@ async def predict_options():
 @app.post("/api/predict")  # HuggingFace Spaces may auto-route POST requests here
 async def ocr_page(
     file: UploadFile,
     _: None = Depends(enforce_rate_limit),
 ):
-    """OCR endpoint using DeepSeek-OCR"""
     img, img_path = await load_img(file)
     try:
         # Save PIL image to temporary file for DeepSeek-OCR
         with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
@@ -761,18 +937,69 @@ async def ocr_page(
             tmp_img_path = tmp_file.name
         try:
             # Use grounding prompt for better structure extraction
             result = await run_deepseek_ocr(
                 tmp_img_path,
                 prompt="<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
-                use_grounding=True
             )
-            return result
         except Exception as e:
-            # Log the error but don't crash - return a helpful error message
             error_msg = str(e)
             print(f"OCR processing error: {error_msg}")
             # Check if it's a model loading issue
             if "matplotlib" in error_msg or "torchvision" in error_msg or "ImportError" in error_msg:
                 raise HTTPException(
@@ -797,6 +1024,165 @@ async def ocr_page(
             os.unlink(img_path)
 @app.post("/split")
 async def split(
     file: UploadFile,
@@ -829,7 +1215,7 @@ async def split(
             try:
                 # Use DeepSeek-OCR with grounding prompt for better structured extraction
                 prompt = "<image>\n<|grounding|>Convert the document region to markdown with preserved layout."
-                ocr_result = await run_deepseek_ocr(crop_path, prompt=prompt, use_grounding=True)
                 # Parse OCR result to extract lines
                 child_lines = ocr_result.get("lines", [])

 from typing import Any, Deque, DefaultDict, Optional
 import numpy as np
+from fastapi import Depends, FastAPI, Form, HTTPException, Request, UploadFile, status, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 from fastapi.security import APIKeyHeader
 from PIL import Image
 _ocr_tokenizer = None
 _model_lock = asyncio.Lock()
+# Job management for async processing and cancellation
+_jobs: dict[str, dict] = {}  # job_id -> {status, progress, result, error, cancelled}
+_jobs_lock = asyncio.Lock()
+_cancellation_tokens: dict[str, asyncio.Event] = {}  # job_id -> cancellation event
 def _download_and_patch_model_locally(model_id: str, revision: str) -> str:
     """
 async def run_deepseek_ocr(
     image_path: str,
     prompt: str = "<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
+    use_grounding: bool = True,
+    job_id: Optional[str] = None,
+    progress_callback = None,
+    detect_fields: bool = True
 ) -> dict:
     """
     Run DeepSeek-OCR on an image file with advanced grounding support.
+    Supports cancellation via job_id and progress updates via callback.
+    If detect_fields=True, also runs locator queries to detect specific fields:
+    - Recipe title
+    - Ingredients list
+    - Instructions/steps
+    Returns additional 'field_boxes' with highlighted locations.
     """
+    # Check for cancellation before starting
+    if job_id:
+        async with _jobs_lock:
+            cancel_event = _cancellation_tokens.get(job_id)
+            if cancel_event and cancel_event.is_set():
+                raise asyncio.CancelledError(f"Job {job_id} was cancelled")
     model, tokenizer = await get_ocr_model()
     output_path = tempfile.mkdtemp()
     try:
+        # Update progress: Preprocessing (0-10%)
+        if progress_callback:
+            await progress_callback(0.05, "Preprocessing image...")
         # OCR quality settings - Gundam preset recommended for CPU/Spaces
         torch = _get_torch()
         if USE_GPU and torch.cuda.is_available():
             actual_image_size = 640
             print(f"  - Using CPU-optimized quality: base_size={actual_base_size}, image_size={actual_image_size}")
+        # Check for cancellation before inference
+        if job_id:
+            async with _jobs_lock:
+                cancel_event = _cancellation_tokens.get(job_id)
+                if cancel_event and cancel_event.is_set():
+                    raise asyncio.CancelledError(f"Job {job_id} was cancelled")
+        # Update progress: Starting inference (10-90%)
+        if progress_callback:
+            await progress_callback(0.10, "Starting OCR inference...")
         # Use torch.inference_mode() to reduce overhead on CPU
+        # Note: We can't interrupt inference mid-process, but we can check before/after
         torch = _get_torch()
         with torch.inference_mode():
+            # Estimate inference takes ~80% of time (10-90%)
+            # We'll update progress during post-processing
             result = model.infer(
                 tokenizer,
                 prompt=prompt,
                 test_compress=False,
             )
+        # Check for cancellation after inference
+        if job_id:
+            async with _jobs_lock:
+                cancel_event = _cancellation_tokens.get(job_id)
+                if cancel_event and cancel_event.is_set():
+                    raise asyncio.CancelledError(f"Job {job_id} was cancelled")
+        # Update progress: Post-processing (90-95%)
+        if progress_callback:
+            await progress_callback(0.90, "Parsing OCR results...")
         # Parse result - DeepSeek-OCR returns structured markdown output
         raw_text = result if isinstance(result, str) else str(result)
         # This parses grounding annotations to get bounding boxes
         lines = _parse_deepseek_output(raw_text)
+        # Update progress: Cleaning output (95-98%)
+        if progress_callback:
+            await progress_callback(0.95, "Cleaning output...")
         # Convert to clean markdown (remove tags, keep text)
         clean_markdown = _deepseek_to_markdown(raw_text)
+        # Detect specific fields using locator pattern if requested
+        field_boxes = {}
+        if detect_fields:
+            if progress_callback:
+                await progress_callback(0.96, "Detecting recipe fields...")
+            # Define field detection prompts using locator pattern
+            field_prompts = {
+                "title": "<image>\nLocate <|ref|>Recipe title<|/ref|> in the image.",
+                "ingredients": "<image>\nLocate <|ref|>Ingredients list<|/ref|> in the image.",
+                "instructions": "<image>\nLocate <|ref|>Instructions or steps<|/ref|> in the image.",
+                "quantity": "<image>\nLocate <|ref|>Total amount or servings<|/ref|> in the image.",
+                "cooking_time": "<image>\nLocate <|ref|>Cooking time or prep time<|/ref|> in the image.",
+            }
+            torch = _get_torch()
+            for field_name, locator_prompt in field_prompts.items():
+                try:
+                    # Check for cancellation
+                    if job_id:
+                        async with _jobs_lock:
+                            cancel_event = _cancellation_tokens.get(job_id)
+                            if cancel_event and cancel_event.is_set():
+                                break
+                    # Run locator query for this field
+                    with torch.inference_mode():
+                        locator_result = model.infer(
+                            tokenizer,
+                            prompt=locator_prompt,
+                            image_file=image_path,
+                            output_path=output_path,
+                            base_size=actual_base_size,
+                            image_size=actual_image_size,
+                            crop_mode=CROP_MODE,
+                            save_results=False,
+                            test_compress=False,
+                        )
+                    # Parse locator boxes from result
+                    locator_text = locator_result if isinstance(locator_result, str) else str(locator_result)
+                    locator_boxes = _parse_locator_boxes(locator_text, field_name)
+                    if locator_boxes:
+                        field_boxes[field_name] = locator_boxes
+                except Exception as e:
+                    print(f"  ⚠️ Field detection for {field_name} failed: {e}")
+                    continue  # Continue with other fields
+        # Update progress: Done (100%)
+        if progress_callback:
+            await progress_callback(1.0, "Complete")
         return {
             "text": clean_markdown,  # Return clean markdown without tags
             "lines": lines,  # Structured lines with bounding boxes
+            "field_boxes": field_boxes if detect_fields else {},  # Field-specific highlight boxes
         }
     except Exception as e:
         print(f"DeepSeek-OCR error: {e}")
             pass
+def _parse_locator_boxes(locator_text: str, field_name: str) -> list:
+    """
+    Parse bounding boxes from locator pattern output.
+    Locator returns: <|ref|>FIELD_NAME<|/ref|><|det|>[x1,y1,x2,y2]<|/det|>
+    """
+    import re
+    boxes = []
+    # Pattern: <|ref|>FIELD<|/ref|><|det|>[x1,y1,x2,y2]<|/det|>
+    # Note: Locator uses [x1,y1,x2,y2] format (not [x,y,w,h])
+    locator_pattern = re.compile(
+        r'<\|ref\|>[^<]*<\|\/ref\|><\|det\|>\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]<\|\/det\|>',
+        re.DOTALL
+    )
+    for match in locator_pattern.finditer(locator_text):
+        x1 = int(match.group(1))
+        y1 = int(match.group(2))
+        x2 = int(match.group(3))
+        y2 = int(match.group(4))
+        # Convert to [x0, y0, x1, y1] format (top-left to bottom-right)
+        boxes.append({
+            "bbox": [x1, y1, x2, y2],
+            "field": field_name,
+            "confidence": 0.95
+        })
+    return boxes
 def _deepseek_to_markdown(s: str) -> str:
     """
     Convert DeepSeek-OCR tagged output to clean Markdown.
 @app.post("/api/predict")  # HuggingFace Spaces may auto-route POST requests here
 async def ocr_page(
     file: UploadFile,
+    job_id: Optional[str] = Form(None),
+    background_tasks: BackgroundTasks = None,
     _: None = Depends(enforce_rate_limit),
 ):
+    """OCR endpoint using DeepSeek-OCR - supports async job processing with SSE streaming"""
+    # Import progress bus
+    try:
+        from progress_bus import bus
+    except ImportError:
+        # Fallback if progress_bus not available
+        bus = None
+    # Generate job_id if not provided
+    if not job_id:
+        if bus:
+            job_id = bus.new_job()
+        else:
+            job_id = secrets.token_urlsafe(16)
+    # Initialize job status (for polling compatibility)
+    async with _jobs_lock:
+        _jobs[job_id] = {
+            "status": "processing",
+            "progress": 0.0,
+            "message": "Initializing...",
+            "result": None,
+            "error": None
+        }
+        _cancellation_tokens[job_id] = asyncio.Event()
+    # Start background task for async processing
+    if background_tasks and bus:
+        # Async mode: return job_id immediately, process in background
+        background_tasks.add_task(run_ocr_job_async, job_id, file, bus)
+        return {"job_id": job_id, "status": "processing", "message": "Job started - use /progress/{job_id} for SSE or /jobs/{job_id}/status for polling"}
+    # Synchronous mode: process immediately
     img, img_path = await load_img(file)
     try:
         # Save PIL image to temporary file for DeepSeek-OCR
         with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
             tmp_img_path = tmp_file.name
         try:
+            # Progress callback to update job status (async-safe)
+            async def update_progress(progress: float, message: str):
+                async with _jobs_lock:
+                    if job_id in _jobs:
+                        _jobs[job_id]["progress"] = progress
+                        _jobs[job_id]["message"] = message
+                # Also send to SSE bus if available
+                if bus:
+                    await bus.send(job_id, pct=progress * 100, stage=message.lower().replace(" ", "_"))
+            # Start OCR processing (can be cancelled)
+            await update_progress(0.0, "Starting OCR...")
+            # Check for cancellation before processing
+            cancel_event = _cancellation_tokens.get(job_id)
+            if cancel_event and cancel_event.is_set():
+                async with _jobs_lock:
+                    _jobs[job_id]["status"] = "cancelled"
+                    _jobs[job_id]["message"] = "Job was cancelled"
+                raise HTTPException(status_code=499, detail="Job was cancelled")
             # Use grounding prompt for better structure extraction
             result = await run_deepseek_ocr(
                 tmp_img_path,
                 prompt="<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
+                use_grounding=True,
+                job_id=job_id,
+                progress_callback=update_progress
             )
+            # Update job with result
+            async with _jobs_lock:
+                if job_id in _jobs:
+                    _jobs[job_id]["status"] = "completed"
+                    _jobs[job_id]["progress"] = 1.0
+                    _jobs[job_id]["result"] = result
+                    _jobs[job_id]["message"] = "Complete"
+            # Finalize SSE stream if available
+            if bus:
+                await bus.finalize(job_id, pct=100, stage="done", **result)
+            return {"job_id": job_id, **result}
+        except asyncio.CancelledError as e:
+            # Job was cancelled
+            async with _jobs_lock:
+                if job_id in _jobs:
+                    _jobs[job_id]["status"] = "cancelled"
+                    _jobs[job_id]["message"] = "Job was cancelled"
+                _cancellation_tokens.pop(job_id, None)
+            raise HTTPException(status_code=499, detail="Job was cancelled")
         except Exception as e:
+            # Log the error and update job status
             error_msg = str(e)
             print(f"OCR processing error: {error_msg}")
+            async with _jobs_lock:
+                if job_id in _jobs:
+                    _jobs[job_id]["status"] = "failed"
+                    _jobs[job_id]["error"] = error_msg
+                    _jobs[job_id]["message"] = f"Error: {error_msg}"
             # Check if it's a model loading issue
             if "matplotlib" in error_msg or "torchvision" in error_msg or "ImportError" in error_msg:
                 raise HTTPException(
             os.unlink(img_path)
async def run_ocr_job_async(job_id: str, file: UploadFile, bus):
    """Background task: run one OCR job and mirror progress to `_jobs` and the SSE bus.

    Args:
        job_id: Key of the job in `_jobs` / `_cancellation_tokens`.
        file: Uploaded image, decoded through `load_img`.
        bus: Progress bus exposing awaitable `send`, `finalize` and `error`.

    Failures and cancellation are recorded on the job record and reported
    through `bus.error` rather than raised. On exit the temporary files are
    deleted and the job's cancellation token is released.

    NOTE(review): `file` is read after the request handler has already
    returned its response; confirm the upload is still open at that point for
    the FastAPI/Starlette version in use.
    """
    img_path = None
    tmp_img_path = None

    # run_deepseek_ocr reports messages such as "Parsing OCR results...".
    # Trailing dots are stripped before the lookup so these keys really match;
    # unmapped messages keep the original "lowercase with underscores" stage.
    stage_map = {
        "preprocessing": "preprocess",
        "preprocessing image": "preprocess",
        "starting ocr inference": "encoding",
        "parsing ocr results": "postprocess",
        "cleaning output": "postprocess",
        "complete": "done",
    }

    async def update_progress(progress: float, message: str):
        """Store progress on the job record, then forward it to the SSE stream."""
        async with _jobs_lock:
            if job_id in _jobs:
                _jobs[job_id]["progress"] = progress
                _jobs[job_id]["message"] = message
        lowered = message.lower()
        stage = stage_map.get(lowered.rstrip(". "), lowered.replace(" ", "_"))
        await bus.send(job_id, pct=progress * 100, stage=stage, msg=message)

    async def mark_cancelled():
        """Flag the job as cancelled and notify SSE listeners (outside the lock)."""
        async with _jobs_lock:
            if job_id in _jobs:
                _jobs[job_id]["status"] = "cancelled"
                _jobs[job_id]["message"] = "Job was cancelled"
        await bus.error(job_id, "Job was cancelled")

    try:
        # Decode (0-5%)
        await bus.send(job_id, pct=1, stage="queued")
        img, img_path = await load_img(file)
        await bus.send(job_id, pct=5, stage="decode")

        # Save PIL image to temporary file for DeepSeek-OCR. The path is
        # recorded before saving so a failed save is still cleaned up below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
            tmp_img_path = tmp_file.name
            img.save(tmp_file, 'JPEG', quality=95)

        # Preprocess (5-20%)
        async with _jobs_lock:
            if job_id not in _jobs:
                return  # Job record is gone; nothing to report against
            _jobs[job_id]["progress"] = 0.05
            _jobs[job_id]["message"] = "Preprocessing image..."
        await bus.send(job_id, pct=20, stage="preprocess")

        # Check for cancellation; the bus is only awaited after the lock is released.
        async with _jobs_lock:
            cancel_event = _cancellation_tokens.get(job_id)
            cancelled = cancel_event is not None and cancel_event.is_set()
        if cancelled:
            await mark_cancelled()
            return

        # Run OCR
        result = await run_deepseek_ocr(
            tmp_img_path,
            prompt="<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
            use_grounding=True,
            job_id=job_id,
            progress_callback=update_progress
        )

        # Update job status
        async with _jobs_lock:
            if job_id in _jobs:
                _jobs[job_id]["status"] = "completed"
                _jobs[job_id]["progress"] = 1.0
                _jobs[job_id]["result"] = result
                _jobs[job_id]["message"] = "Complete"

        # Finalize SSE stream
        await bus.finalize(job_id, pct=100, stage="done", **result)
    except asyncio.CancelledError:
        # run_deepseek_ocr raises CancelledError when the job's token is set.
        await mark_cancelled()
    except Exception as e:
        error_msg = str(e)
        async with _jobs_lock:
            if job_id in _jobs:
                _jobs[job_id]["status"] = "failed"
                _jobs[job_id]["error"] = error_msg
                _jobs[job_id]["message"] = f"Error: {error_msg}"
        await bus.error(job_id, error_msg)
    finally:
        # Cleanup temp files
        if tmp_img_path and os.path.exists(tmp_img_path):
            os.unlink(tmp_img_path)
        if img_path and os.path.exists(img_path):
            os.unlink(img_path)
        # The job is finished either way, so its cancellation token is no
        # longer needed; dropping it keeps the token table from growing.
        async with _jobs_lock:
            _cancellation_tokens.pop(job_id, None)
@app.get("/progress/{job_id}")
async def get_progress_stream(job_id: str):
    """Stream real-time OCR progress for `job_id` as Server-Sent Events.

    Responds with 503 when the optional `progress_bus` module cannot be
    imported; otherwise wraps `bus.stream(job_id)` in an event-stream response.
    """
    try:
        from progress_bus import bus
    except ImportError:
        raise HTTPException(status_code=503, detail="SSE streaming not available")

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        # Disable nginx buffering
        "X-Accel-Buffering": "no",
    }
    event_source = bus.stream(job_id)
    return StreamingResponse(
        event_source,
        media_type="text/event-stream",
        headers=sse_headers,
    )
@app.get("/jobs/{job_id}/status")
async def get_job_status(job_id: str):
    """Polling endpoint: report the current state of an OCR job.

    Responds with 404 when `job_id` is unknown. Otherwise returns the job's
    status, progress (0.0 to 1.0), message, and the result/error if present.
    """
    async with _jobs_lock:
        if job_id not in _jobs:
            raise HTTPException(status_code=404, detail="Job not found")

        record = _jobs[job_id]
        payload = {"job_id": job_id}
        # Always-present fields of the job record.
        for key in ("status", "progress", "message"):
            payload[key] = record[key]
        # Optional fields default to None when absent.
        for key in ("result", "error"):
            payload[key] = record.get(key)
        return payload
@app.post("/jobs/{job_id}/cancel")
async def cancel_job(job_id: str):
    """Request cancellation of a running OCR job.

    Responds with 404 when `job_id` is unknown. A job that is already
    completed, failed or cancelled is reported as such and left untouched.
    Otherwise the job's cancellation event is set, its status becomes
    "cancelling", and SSE listeners are notified when `progress_bus` is
    importable.
    """
    async with _jobs_lock:
        if job_id not in _jobs:
            raise HTTPException(status_code=404, detail="Job not found")
        job = _jobs[job_id]
        if job["status"] in ("completed", "failed", "cancelled"):
            return {"message": f"Job already {job['status']}"}
        # Set cancellation flag
        token = _cancellation_tokens.get(job_id)
        if token is not None:
            token.set()
        job["status"] = "cancelling"
        job["message"] = "Cancellation requested..."

    # Notify the SSE stream only after releasing the lock, so status polling
    # and progress callbacks are not blocked while the bus call is awaited.
    try:
        from progress_bus import bus
    except ImportError:
        bus = None
    if bus is not None:
        await bus.error(job_id, "Cancellation requested")
    return {"message": "Cancellation requested", "job_id": job_id}
 @app.post("/split")
 async def split(
     file: UploadFile,
             try:
                 # Use DeepSeek-OCR with grounding prompt for better structured extraction
                 prompt = "<image>\n<|grounding|>Convert the document region to markdown with preserved layout."
+                ocr_result = await run_deepseek_ocr(crop_path, prompt=prompt, use_grounding=True, detect_fields=False)
                 # Parse OCR result to extract lines
                 child_lines = ocr_result.get("lines", [])