devappsmi committed on
Commit
3771183
·
verified ·
1 Parent(s): 84ef6a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -97
app.py CHANGED
@@ -1,19 +1,16 @@
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
- Deploys on Hugging Face Spaces as a FastAPI app.
5
- Connects to vLLM Docker running on your GPU server.
 
 
 
 
 
6
 
7
  Architecture:
8
- Gradio App (another HF Space or any client)
9
- |
10
- This HF Space (Bridge, port 7860)
11
- |
12
- Your GPU Server (vLLM Docker, 117.54.141.62:8000)
13
-
14
- HF Space Settings → Variables and secrets:
15
- VLLM_SERVER_URL = http://117.54.141.62:8000/v1
16
- API_KEY = (optional, for auth)
17
  """
18
 
19
  import base64
@@ -23,13 +20,14 @@ import shutil
23
  import tempfile
24
  import traceback
25
  import uuid
26
- from typing import Any, Dict, Optional
27
 
28
  import uvicorn
29
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
30
  from fastapi.middleware.cors import CORSMiddleware
31
  from fastapi.staticfiles import StaticFiles
32
  from openai import OpenAI
 
33
 
34
  # =============================================================================
35
  # Configuration
@@ -38,19 +36,18 @@ VLLM_SERVER_URL = os.environ.get("VLLM_SERVER_URL", "http://117.54.141.62:8000/v
38
  VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
39
  BRIDGE_PORT = int(os.environ.get("PORT", "7860"))
40
  API_KEY = os.environ.get("API_KEY", "")
41
- # Public base URL for serving static files (auto-detect from HF Space)
42
  SPACE_HOST = os.environ.get("SPACE_HOST", "")
43
  if SPACE_HOST:
44
  PUBLIC_BASE_URL = f"https://{SPACE_HOST}"
45
  else:
46
  PUBLIC_BASE_URL = os.environ.get("PUBLIC_BASE_URL", f"http://localhost:{BRIDGE_PORT}")
47
 
48
- # Directory to store and serve output images
49
  STATIC_DIR = "/tmp/ocr_outputs"
50
  os.makedirs(STATIC_DIR, exist_ok=True)
51
 
52
  # =============================================================================
53
- # Initialize OpenAI client
54
  # =============================================================================
55
  openai_client = OpenAI(
56
  api_key="EMPTY",
@@ -58,14 +55,10 @@ openai_client = OpenAI(
58
  timeout=600
59
  )
60
 
61
- # =============================================================================
62
- # PaddleOCR pipeline
63
- # =============================================================================
64
  pipeline = None
65
 
66
 
67
  def get_pipeline():
68
- """Lazy-load the PaddleOCR pipeline."""
69
  global pipeline
70
  if pipeline is None:
71
  from paddleocr import PaddleOCRVL
@@ -81,7 +74,7 @@ def get_pipeline():
81
  # =============================================================================
82
  app = FastAPI(
83
  title="PaddleOCR-VL-1.5 Bridge API",
84
- description="Full document parsing API bridge between Gradio UI and vLLM server",
85
  version="1.0.0"
86
  )
87
 
@@ -93,7 +86,6 @@ app.add_middleware(
93
  allow_headers=["*"],
94
  )
95
 
96
- # Serve static files (output images)
97
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
98
 
99
 
@@ -122,7 +114,6 @@ IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
122
 
123
 
124
  def save_temp_image(file_data: str) -> str:
125
- """Save base64 or URL image to temp file."""
126
  if file_data.startswith(("http://", "https://")):
127
  import requests as req
128
  resp = req.get(file_data, timeout=120)
@@ -146,139 +137,338 @@ def save_temp_image(file_data: str) -> str:
146
  return tmp.name
147
 
148
 
149
- def collect_output_images(output_dir: str, request_id: str) -> Dict[str, str]:
150
- """
151
- Find all image files in the output directory,
152
- copy them to the static dir, and return a dict of {name: public_url}.
153
- """
154
- output_images = {}
155
- if not os.path.exists(output_dir):
156
- return output_images
157
-
158
- # Create a subdirectory for this request
159
  static_subdir = os.path.join(STATIC_DIR, request_id)
160
  os.makedirs(static_subdir, exist_ok=True)
161
-
162
- for root, dirs, files in os.walk(output_dir):
163
- for filename in files:
164
- ext = os.path.splitext(filename)[1].lower()
 
 
 
 
 
 
 
 
 
165
  if ext in IMAGE_EXTENSIONS:
166
- src_path = os.path.join(root, filename)
167
- dst_path = os.path.join(static_subdir, filename)
168
- shutil.copy2(src_path, dst_path)
169
- public_url = f"{PUBLIC_BASE_URL}/static/{request_id}/{filename}"
170
- output_images[filename] = public_url
171
-
172
- return output_images
173
 
174
 
175
- def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
176
- """Element-level recognition via direct vLLM call."""
177
- if file_data.startswith(("http://", "https://")):
178
- image_url = file_data
179
- else:
180
- image_url = f"data:image/png;base64,{file_data}"
181
-
182
- task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
183
-
184
- response = openai_client.chat.completions.create(
185
- model=VLLM_MODEL_NAME,
186
- messages=[{
187
- "role": "user",
188
- "content": [
189
- {"type": "image_url", "image_url": {"url": image_url}},
190
- {"type": "text", "text": task_prompt}
191
- ]
192
- }],
193
- temperature=0.0
194
- )
195
 
196
- result_text = response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- return {
199
- "errorCode": 0,
200
- "result": {
201
- "layoutParsingResults": [{
202
- "markdown": {"text": result_text, "images": {}},
203
- "outputImages": {},
204
- "prunedResult": {
205
- "spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
206
- }
207
- }]
208
  }
209
- }
 
210
 
211
 
212
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
213
  use_doc_unwarping: bool = True,
214
  use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
215
- """Full document parsing with layout detection + VLM recognition."""
216
  tmp_path = save_temp_image(file_data)
217
  request_id = str(uuid.uuid4())[:12]
218
 
219
  try:
 
 
 
 
 
 
 
 
220
  pipe = get_pipeline()
221
  output = pipe.predict(tmp_path)
222
 
223
- results = []
 
 
 
224
  for i, res in enumerate(output):
 
225
  output_dir = tempfile.mkdtemp()
226
 
227
- # Save all outputs (json, markdown, images)
228
  res.save_to_json(save_path=output_dir)
229
  res.save_to_markdown(save_path=output_dir)
230
-
231
- # Try to save visualization image
232
  try:
233
  res.save_to_img(save_path=output_dir)
234
  except Exception:
235
  pass
236
 
237
- # Read markdown
238
  md_text = ""
239
  md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
240
  if md_files:
241
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
242
  md_text = f.read()
243
 
244
- # Read JSON
245
  json_data = {}
246
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
247
  if json_files:
248
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
249
  json_data = json.load(f)
250
 
251
- # Collect and serve output images
252
- page_request_id = f"{request_id}_page{i}"
253
- output_images = collect_output_images(output_dir, page_request_id)
254
 
255
- # Also check for images referenced in markdown
256
- md_images = {}
257
- for fname, url in output_images.items():
258
- # Replace local paths in markdown with public URLs
259
- md_text = md_text.replace(fname, url)
260
- md_images[fname] = url
 
 
 
261
 
262
- results.append({
263
- "markdown": {"text": md_text, "images": md_images},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  "outputImages": output_images,
265
- "jsonData": json_data
 
 
 
 
 
 
 
266
  })
267
 
268
  return {
269
  "errorCode": 0,
270
  "result": {
271
- "layoutParsingResults": results if results else [{
 
 
 
 
 
 
 
272
  "markdown": {"text": "", "images": {}},
273
- "outputImages": {}
274
- }]
 
 
 
 
 
 
 
275
  }
276
  }
 
277
  finally:
278
  if os.path.exists(tmp_path):
279
  os.unlink(tmp_path)
280
 
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def _parse_spotting(text: str) -> dict:
283
  try:
284
  return json.loads(text)
@@ -308,6 +498,7 @@ async def health():
308
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
309
  """
310
  Main OCR endpoint — compatible with the Gradio app.
 
311
 
312
  Body:
313
  {
 
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
+ Returns full JSON response matching the official Baidu API format, including:
5
+ - layoutParsingResults[].prunedResult (blocks, labels, bboxes, polygon points)
6
+ - layoutParsingResults[].markdown (text + images)
7
+ - layoutParsingResults[].outputImages (visualization URLs)
8
+ - layoutParsingResults[].inputImage
9
+ - preprocessedImages
10
+ - dataInfo
11
 
12
  Architecture:
13
+ Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
 
 
 
 
 
 
 
 
14
  """
15
 
16
  import base64
 
20
  import tempfile
21
  import traceback
22
  import uuid
23
+ from typing import Any, Dict, List, Optional
24
 
25
  import uvicorn
26
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
27
  from fastapi.middleware.cors import CORSMiddleware
28
  from fastapi.staticfiles import StaticFiles
29
  from openai import OpenAI
30
+ from PIL import Image
31
 
32
  # =============================================================================
33
  # Configuration
 
36
  VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
37
  BRIDGE_PORT = int(os.environ.get("PORT", "7860"))
38
  API_KEY = os.environ.get("API_KEY", "")
39
+
40
  SPACE_HOST = os.environ.get("SPACE_HOST", "")
41
  if SPACE_HOST:
42
  PUBLIC_BASE_URL = f"https://{SPACE_HOST}"
43
  else:
44
  PUBLIC_BASE_URL = os.environ.get("PUBLIC_BASE_URL", f"http://localhost:{BRIDGE_PORT}")
45
 
 
46
  STATIC_DIR = "/tmp/ocr_outputs"
47
  os.makedirs(STATIC_DIR, exist_ok=True)
48
 
49
  # =============================================================================
50
+ # Initialize clients
51
  # =============================================================================
52
  openai_client = OpenAI(
53
  api_key="EMPTY",
 
55
  timeout=600
56
  )
57
 
 
 
 
58
  pipeline = None
59
 
60
 
61
  def get_pipeline():
 
62
  global pipeline
63
  if pipeline is None:
64
  from paddleocr import PaddleOCRVL
 
74
  # =============================================================================
75
  app = FastAPI(
76
  title="PaddleOCR-VL-1.5 Bridge API",
77
+ description="Full document parsing API matching official Baidu API format",
78
  version="1.0.0"
79
  )
80
 
 
86
  allow_headers=["*"],
87
  )
88
 
 
89
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
90
 
91
 
 
114
 
115
 
116
  def save_temp_image(file_data: str) -> str:
 
117
  if file_data.startswith(("http://", "https://")):
118
  import requests as req
119
  resp = req.get(file_data, timeout=120)
 
137
  return tmp.name
138
 
139
 
140
+ def serve_file(src_path: str, request_id: str, filename: str) -> str:
141
+ """Copy a file to the static dir and return its public URL."""
 
 
 
 
 
 
 
 
142
  static_subdir = os.path.join(STATIC_DIR, request_id)
143
  os.makedirs(static_subdir, exist_ok=True)
144
+ dst_path = os.path.join(static_subdir, filename)
145
+ shutil.copy2(src_path, dst_path)
146
+ return f"{PUBLIC_BASE_URL}/static/{request_id}/{filename}"
147
+
148
+
149
+ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
150
+ """Find all images in a directory and serve them. Returns {filename: url}."""
151
+ result = {}
152
+ if not os.path.exists(directory):
153
+ return result
154
+ for root, dirs, files in os.walk(directory):
155
+ for fname in files:
156
+ ext = os.path.splitext(fname)[1].lower()
157
  if ext in IMAGE_EXTENSIONS:
158
+ src = os.path.join(root, fname)
159
+ # Preserve subdirectory structure in the filename
160
+ rel_path = os.path.relpath(src, directory)
161
+ safe_name = rel_path.replace(os.sep, "_")
162
+ url = serve_file(src, request_id, safe_name)
163
+ result[rel_path] = url
164
+ return result
165
 
166
 
167
+ def extract_pruned_result(res_obj, page_index: int = 0) -> Dict[str, Any]:
168
+ """
169
+ Extract the full prunedResult from a PaddleOCR result object,
170
+ matching the official Baidu API format.
171
+ """
172
+ pruned = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
+ try:
175
+ # Try to get the raw dict/json from the result object
176
+ if hasattr(res_obj, 'json'):
177
+ raw = res_obj.json if isinstance(res_obj.json, dict) else {}
178
+ elif hasattr(res_obj, '_result'):
179
+ raw = res_obj._result if isinstance(res_obj._result, dict) else {}
180
+ elif hasattr(res_obj, 'to_dict'):
181
+ raw = res_obj.to_dict()
182
+ else:
183
+ raw = {}
184
+
185
+ # Try multiple attribute paths to find the parsing results
186
+ parsing_res_list = []
187
+ layout_det_res = {"boxes": []}
188
+
189
+ # Check common attribute names
190
+ for attr in ['parsing_res_list', 'parsing_result', 'blocks']:
191
+ if hasattr(res_obj, attr):
192
+ parsing_res_list = getattr(res_obj, attr, [])
193
+ break
194
+
195
+ # Check for layout detection results
196
+ for attr in ['layout_det_res', 'layout_result', 'det_res']:
197
+ if hasattr(res_obj, attr):
198
+ layout_det_res = getattr(res_obj, attr, {})
199
+ break
200
+
201
+ # Get image dimensions
202
+ width = 0
203
+ height = 0
204
+ for attr in ['img_width', 'width']:
205
+ if hasattr(res_obj, attr):
206
+ width = getattr(res_obj, attr, 0)
207
+ break
208
+ for attr in ['img_height', 'height']:
209
+ if hasattr(res_obj, attr):
210
+ height = getattr(res_obj, attr, 0)
211
+ break
212
+
213
+ # If we got raw dict, try to extract from it
214
+ if raw and not parsing_res_list:
215
+ parsing_res_list = raw.get('parsing_res_list', raw.get('blocks', []))
216
+ layout_det_res = raw.get('layout_det_res', {"boxes": []})
217
+ width = raw.get('width', width)
218
+ height = raw.get('height', height)
219
+
220
+ pruned = {
221
+ "page_count": 1,
222
+ "width": width,
223
+ "height": height,
224
+ "model_settings": {
225
+ "use_doc_preprocessor": False,
226
+ "use_layout_detection": True,
227
+ "use_chart_recognition": False,
228
+ "use_seal_recognition": True,
229
+ "use_ocr_for_image_block": False,
230
+ "format_block_content": True,
231
+ "merge_layout_blocks": True,
232
+ "markdown_ignore_labels": [
233
+ "number", "footnote", "header",
234
+ "header_image", "footer", "footer_image", "aside_text"
235
+ ],
236
+ "return_layout_polygon_points": True
237
+ },
238
+ "parsing_res_list": parsing_res_list if isinstance(parsing_res_list, list) else [],
239
+ "layout_det_res": layout_det_res if isinstance(layout_det_res, dict) else {"boxes": []}
240
+ }
241
 
242
+ except Exception as e:
243
+ print(f"Warning: Could not extract prunedResult: {e}")
244
+ traceback.print_exc()
245
+ pruned = {
246
+ "page_count": 1,
247
+ "width": 0,
248
+ "height": 0,
249
+ "model_settings": {},
250
+ "parsing_res_list": [],
251
+ "layout_det_res": {"boxes": []}
252
  }
253
+
254
+ return pruned
255
 
256
 
257
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
258
  use_doc_unwarping: bool = True,
259
  use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
260
+ """Full document parsing returns response matching official Baidu API format."""
261
  tmp_path = save_temp_image(file_data)
262
  request_id = str(uuid.uuid4())[:12]
263
 
264
  try:
265
+ # Get image dimensions
266
+ try:
267
+ img = Image.open(tmp_path)
268
+ img_width, img_height = img.size
269
+ img.close()
270
+ except Exception:
271
+ img_width, img_height = 0, 0
272
+
273
  pipe = get_pipeline()
274
  output = pipe.predict(tmp_path)
275
 
276
+ layout_parsing_results = []
277
+ preprocessed_images = []
278
+ data_info_pages = []
279
+
280
  for i, res in enumerate(output):
281
+ page_id = f"{request_id}_p{i}"
282
  output_dir = tempfile.mkdtemp()
283
 
284
+ # Save all outputs
285
  res.save_to_json(save_path=output_dir)
286
  res.save_to_markdown(save_path=output_dir)
 
 
287
  try:
288
  res.save_to_img(save_path=output_dir)
289
  except Exception:
290
  pass
291
 
292
+ # --- Read markdown ---
293
  md_text = ""
294
  md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
295
  if md_files:
296
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
297
  md_text = f.read()
298
 
299
+ # --- Read JSON (contains prunedResult data) ---
300
  json_data = {}
301
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
302
  if json_files:
303
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
304
  json_data = json.load(f)
305
 
306
+ # --- Collect and serve all images ---
307
+ all_images = collect_images_from_dir(output_dir, page_id)
 
308
 
309
+ # --- Build outputImages ---
310
+ output_images = {}
311
+ for rel_path, url in all_images.items():
312
+ name = os.path.splitext(os.path.basename(rel_path))[0]
313
+ # Identify layout detection visualization
314
+ if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
315
+ output_images["layout_det_res"] = url
316
+ else:
317
+ output_images[name] = url
318
 
319
+ # --- Build markdown images map ---
320
+ md_images = {}
321
+ imgs_dir = os.path.join(output_dir, "imgs")
322
+ if os.path.exists(imgs_dir):
323
+ for fname in os.listdir(imgs_dir):
324
+ ext = os.path.splitext(fname)[1].lower()
325
+ if ext in IMAGE_EXTENSIONS:
326
+ src = os.path.join(imgs_dir, fname)
327
+ url = serve_file(src, page_id, fname)
328
+ local_ref = f"imgs/{fname}"
329
+ md_images[local_ref] = url
330
+ # Replace references in markdown
331
+ md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
332
+ md_text = md_text.replace(f']({local_ref})', f']({url})')
333
+
334
+ # --- Serve input image ---
335
+ input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
336
+
337
+ # --- Build prunedResult from JSON data or result object ---
338
+ pruned_result = {}
339
+ if json_data:
340
+ # Try to use the saved JSON directly
341
+ pruned_result = {
342
+ "page_count": json_data.get("page_count", 1),
343
+ "width": json_data.get("width", img_width),
344
+ "height": json_data.get("height", img_height),
345
+ "model_settings": json_data.get("model_settings", {
346
+ "use_doc_preprocessor": False,
347
+ "use_layout_detection": True,
348
+ "use_chart_recognition": use_chart_recognition,
349
+ "use_seal_recognition": True,
350
+ "use_ocr_for_image_block": False,
351
+ "format_block_content": True,
352
+ "merge_layout_blocks": True,
353
+ "markdown_ignore_labels": [
354
+ "number", "footnote", "header",
355
+ "header_image", "footer", "footer_image", "aside_text"
356
+ ],
357
+ "return_layout_polygon_points": True
358
+ }),
359
+ "parsing_res_list": json_data.get("parsing_res_list",
360
+ json_data.get("blocks", [])),
361
+ "layout_det_res": json_data.get("layout_det_res",
362
+ json_data.get("det_res", {"boxes": []}))
363
+ }
364
+ else:
365
+ pruned_result = extract_pruned_result(res, i)
366
+
367
+ # Ensure dimensions are set
368
+ if not pruned_result.get("width"):
369
+ pruned_result["width"] = img_width
370
+ if not pruned_result.get("height"):
371
+ pruned_result["height"] = img_height
372
+
373
+ # --- Build page result ---
374
+ page_result = {
375
+ "prunedResult": pruned_result,
376
+ "markdown": {
377
+ "text": md_text,
378
+ "images": md_images
379
+ },
380
  "outputImages": output_images,
381
+ "inputImage": input_image_url
382
+ }
383
+
384
+ layout_parsing_results.append(page_result)
385
+ preprocessed_images.append(input_image_url)
386
+ data_info_pages.append({
387
+ "width": img_width,
388
+ "height": img_height
389
  })
390
 
391
  return {
392
  "errorCode": 0,
393
  "result": {
394
+ "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
395
+ "prunedResult": {
396
+ "page_count": 0,
397
+ "width": 0,
398
+ "height": 0,
399
+ "parsing_res_list": [],
400
+ "layout_det_res": {"boxes": []}
401
+ },
402
  "markdown": {"text": "", "images": {}},
403
+ "outputImages": {},
404
+ "inputImage": ""
405
+ }],
406
+ "preprocessedImages": preprocessed_images,
407
+ "dataInfo": {
408
+ "type": "image",
409
+ "numPages": len(layout_parsing_results),
410
+ "pages": data_info_pages
411
+ }
412
  }
413
  }
414
+
415
  finally:
416
  if os.path.exists(tmp_path):
417
  os.unlink(tmp_path)
418
 
419
 
420
+ def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
421
+ """Element-level recognition via direct vLLM call."""
422
+ if file_data.startswith(("http://", "https://")):
423
+ image_url = file_data
424
+ else:
425
+ image_url = f"data:image/png;base64,{file_data}"
426
+
427
+ task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
428
+
429
+ response = openai_client.chat.completions.create(
430
+ model=VLLM_MODEL_NAME,
431
+ messages=[{
432
+ "role": "user",
433
+ "content": [
434
+ {"type": "image_url", "image_url": {"url": image_url}},
435
+ {"type": "text", "text": task_prompt}
436
+ ]
437
+ }],
438
+ temperature=0.0
439
+ )
440
+
441
+ result_text = response.choices[0].message.content
442
+
443
+ return {
444
+ "errorCode": 0,
445
+ "result": {
446
+ "layoutParsingResults": [{
447
+ "prunedResult": {
448
+ "page_count": 1,
449
+ "width": 0,
450
+ "height": 0,
451
+ "parsing_res_list": [{
452
+ "block_label": prompt_label,
453
+ "block_content": result_text,
454
+ "block_bbox": [],
455
+ "block_id": 0,
456
+ "block_order": 0,
457
+ "group_id": 0,
458
+ "global_block_id": 0,
459
+ "global_group_id": 0,
460
+ "block_polygon_points": []
461
+ }],
462
+ "layout_det_res": {"boxes": []}
463
+ },
464
+ "markdown": {"text": result_text, "images": {}},
465
+ "outputImages": {},
466
+ "prunedResult.spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
467
+ }]
468
+ }
469
+ }
470
+
471
+
472
  def _parse_spotting(text: str) -> dict:
473
  try:
474
  return json.loads(text)
 
498
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
499
  """
500
  Main OCR endpoint — compatible with the Gradio app.
501
+ Returns full JSON matching official Baidu API format.
502
 
503
  Body:
504
  {