devappsmi commited on
Commit
bd52104
Β·
verified Β·
1 Parent(s): cdfa6df

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. README.md +17 -7
  3. app.py +382 -0
  4. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal runtime image for the PaddleOCR-VL bridge server (HF Spaces).
FROM python:3.10-slim

WORKDIR /app

# Native libraries needed at runtime by OpenCV / PaddleOCR wheels.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so Docker layer
# caching survives app.py edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# HF Spaces routes external traffic to this port (see app_port in README).
EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,22 @@
1
  ---
2
- title: Document Parse
3
- emoji: πŸ‘€
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: docker
 
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Bridge Server
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: PaddleOCR-VL-1.5 Bridge API
3
+ emoji: πŸ“„
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
 
 
9
  ---
10
 
11
+ # PaddleOCR-VL-1.5 Bridge API
12
+
13
+ Bridge server that connects to a vLLM backend for full document parsing.
14
+
15
+ ## Endpoints
16
+
17
+ - `GET /health` - Health check
18
+ - `GET /docs` - Swagger UI
19
+ - `POST /api/ocr` - Gradio-compatible OCR API
20
+ - `POST /api/parse` - File upload API
21
+ - `POST /api/parse/markdown` - Returns markdown only
22
+ - `POST /v1/chat/completions` - OpenAI-compatible proxy
app.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
+ ====================================================
4
+ Deploys on Hugging Face Spaces as a FastAPI app.
5
+ Connects to vLLM Docker running on your GPU server.
6
+
7
+ Architecture:
8
+ Gradio App (another HF Space or any client)
9
+ |
10
+ This HF Space (Bridge, port 7860)
11
+ |
12
+ Your GPU Server (vLLM Docker, 117.54.141.62:8000)
13
+
14
+ HF Space Settings β†’ Variables and secrets:
15
+ VLLM_SERVER_URL = http://117.54.141.62:8000/v1
16
+ API_KEY = (optional, for auth)
17
+
18
+ Your GPU Server:
19
+ docker run --rm --gpus all -p 8000:8000 -v ~/.cache/paddleocr:/root/.cache ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-nvidia-gpu paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8000 --backend vllm
20
+
21
+ Gradio App HF Space env:
22
+ API_URL = https://<your-bridge-space>.hf.space/api/ocr
23
+ """
24
+
25
+ import base64
26
+ import json
27
+ import os
28
+ import tempfile
29
+ import traceback
30
+ from typing import Any, Dict, Optional
31
+
32
+ import uvicorn
33
+ from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
34
+ from fastapi.middleware.cors import CORSMiddleware
35
+ from openai import OpenAI
36
+
37
+ # =============================================================================
38
+ # Configuration
39
+ # =============================================================================
40
+ VLLM_SERVER_URL = os.environ.get("VLLM_SERVER_URL", "http://117.54.141.62:8000/v1")
41
+ VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
42
+ BRIDGE_PORT = int(os.environ.get("PORT", "7860")) # HF Spaces default port
43
+ API_KEY = os.environ.get("API_KEY", "")
44
+
45
+ # =============================================================================
46
+ # Initialize OpenAI client (for element-level recognition)
47
+ # =============================================================================
48
+ openai_client = OpenAI(
49
+ api_key="EMPTY",
50
+ base_url=VLLM_SERVER_URL,
51
+ timeout=600
52
+ )
53
+
54
+ # =============================================================================
55
+ # PaddleOCR pipeline (for full document parsing with layout detection)
56
+ # =============================================================================
57
+ pipeline = None
58
+
59
+
60
def get_pipeline():
    """Return the shared PaddleOCRVL pipeline, creating it on first use.

    The import and construction are deferred so the Space boots fast and
    element-level requests never pay the pipeline startup cost.
    """
    global pipeline
    if pipeline is not None:
        return pipeline
    from paddleocr import PaddleOCRVL
    pipeline = PaddleOCRVL(
        vl_rec_backend="vllm-server",
        vl_rec_server_url=VLLM_SERVER_URL
    )
    return pipeline
70
+
71
+
72
+ # =============================================================================
73
+ # FastAPI App
74
+ # =============================================================================
75
# =============================================================================
# FastAPI application. CORS is fully open because the bridge is called
# cross-origin from other HF Spaces / browser clients.
# =============================================================================
app = FastAPI(
    title="PaddleOCR-VL-1.5 Bridge API",
    description="Full document parsing API — bridge between Gradio UI and vLLM server",
    version="1.0.0"
)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
88
+
89
+
90
+ # =============================================================================
91
+ # Auth
92
+ # =============================================================================
93
def verify_auth(authorization: Optional[str] = None) -> None:
    """Reject the request unless it carries the configured bearer token.

    Auth is enforced only when the API_KEY env var is set to a non-blank
    value; otherwise every request passes.

    Raises:
        HTTPException: 401 when a key is configured and the Authorization
            header is missing or does not equal ``Bearer <API_KEY>``.
    """
    if API_KEY and API_KEY.strip():
        import hmac  # local import: only needed when auth is enabled
        expected = f"Bearer {API_KEY}"
        # compare_digest is constant-time, unlike `==`, so the token can't
        # be probed character-by-character via response timing.
        if not authorization or not hmac.compare_digest(authorization, expected):
            raise HTTPException(status_code=401, detail="Unauthorized")
97
+
98
+
99
+ # =============================================================================
100
+ # Helpers
101
+ # =============================================================================
102
# Client-facing task labels mapped to the exact prompt strings the
# PaddleOCR-VL model expects for each element-level recognition task.
TASK_PROMPTS: Dict[str, str] = {
    "ocr": "OCR:",
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
    "chart": "Chart Recognition:",
    "spotting": "Spotting:",
    "seal": "Seal Recognition:",
}
110
+
111
+
112
def save_temp_image(file_data: str) -> str:
    """Materialize an image given as a URL, base64, or data URI into a temp file.

    Args:
        file_data: An http(s) URL, a raw base64 string, or a
            ``data:<mime>;base64,<payload>`` URI (new, backward-compatible).

    Returns:
        Path to a newly created temporary file holding the image bytes.
        The caller is responsible for deleting the file.
    """
    if file_data.startswith(("http://", "https://")):
        import requests as req
        resp = req.get(file_data, timeout=120)
        resp.raise_for_status()
        content = resp.content
        # Pick a suffix from the response content type; default to .png.
        ct = resp.headers.get("content-type", "image/png")
        ext = ".png"
        if "jpeg" in ct or "jpg" in ct:
            ext = ".jpg"
        elif "webp" in ct:
            ext = ".webp"
        elif "bmp" in ct:
            ext = ".bmp"
    else:
        # Accept "data:...;base64,<payload>" URIs as well as bare base64.
        if file_data.startswith("data:") and "," in file_data:
            file_data = file_data.split(",", 1)[1]
        content = base64.b64decode(file_data)
        ext = ".png"

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    try:
        tmp.write(content)
    finally:
        # Always release the handle, even if the write fails mid-way.
        tmp.close()
    return tmp.name
135
+
136
+
137
def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
    """Run one element-level recognition task via a direct vLLM call.

    Args:
        file_data: An http(s) URL or a raw base64-encoded image.
        prompt_label: One of the TASK_PROMPTS keys; unknown labels fall
            back to plain OCR.

    Returns:
        A dict shaped like the Gradio-compatible OCR response payload.
    """
    is_remote = file_data.startswith(("http://", "https://"))
    image_url = file_data if is_remote else f"data:image/png;base64,{file_data}"

    message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": TASK_PROMPTS.get(prompt_label, "OCR:")},
        ],
    }
    response = openai_client.chat.completions.create(
        model=VLLM_MODEL_NAME,
        messages=[message],
        temperature=0.0,
    )
    result_text = response.choices[0].message.content

    # Spotting responses carry structured coordinates; try to parse them.
    spotting = _parse_spotting(result_text) if prompt_label == "spotting" else {}
    page = {
        "markdown": {"text": result_text, "images": {}},
        "outputImages": {},
        "prunedResult": {"spotting_res": spotting},
    }
    return {"errorCode": 0, "result": {"layoutParsingResults": [page]}}
172
+
173
+
174
def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                          use_doc_unwarping: bool = True,
                          use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
    """Full document parsing: layout detection + per-region VLM recognition.

    Args:
        file_data: An http(s) URL or base64-encoded image/document.
        use_chart_recognition, use_doc_unwarping, use_doc_orientation_classify:
            Accepted for API compatibility.
            NOTE(review): these flags are currently NOT forwarded to the
            PaddleOCR pipeline — confirm the PaddleOCRVL predict() kwargs
            and wire them through.

    Returns:
        Gradio-compatible payload with one layoutParsingResults entry per
        parsed page (markdown text plus the raw JSON result).
    """
    import shutil  # local import: only needed for scratch-dir cleanup

    tmp_path = save_temp_image(file_data)
    try:
        pipe = get_pipeline()
        output = pipe.predict(tmp_path)

        results = []
        for res in output:
            # Each page result is serialized to a scratch dir, then read back.
            output_dir = tempfile.mkdtemp()
            try:
                res.save_to_json(save_path=output_dir)
                res.save_to_markdown(save_path=output_dir)

                md_text = ""
                md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
                if md_files:
                    with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
                        md_text = f.read()

                json_data = {}
                json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
                if json_files:
                    with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
                        json_data = json.load(f)
            finally:
                # The original code leaked one temp dir per page; remove it
                # now that its contents are in memory.
                shutil.rmtree(output_dir, ignore_errors=True)

            results.append({
                "markdown": {"text": md_text, "images": {}},
                "outputImages": {},
                "jsonData": json_data
            })

        return {
            "errorCode": 0,
            "result": {
                "layoutParsingResults": results if results else [{
                    "markdown": {"text": "", "images": {}},
                    "outputImages": {}
                }]
            }
        }
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
220
+
221
+
222
+ def _parse_spotting(text: str) -> dict:
223
+ try:
224
+ return json.loads(text)
225
+ except (json.JSONDecodeError, TypeError):
226
+ return {"raw_text": text}
227
+
228
+
229
+ # =============================================================================
230
+ # Endpoints
231
+ # =============================================================================
232
+
233
+ @app.get("/")
234
+ async def root():
235
+ return {
236
+ "service": "PaddleOCR-VL-1.5 Bridge API",
237
+ "status": "running",
238
+ "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
239
+ }
240
+
241
+
242
+ @app.get("/health")
243
+ async def health():
244
+ return {"status": "ok", "model": VLLM_MODEL_NAME, "vllm_url": VLLM_SERVER_URL}
245
+
246
+
247
+ @app.post("/api/ocr")
248
+ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
249
+ """
250
+ Main OCR endpoint β€” compatible with the Gradio app.
251
+
252
+ Body:
253
+ {
254
+ "file": "base64_or_url",
255
+ "useLayoutDetection": true/false,
256
+ "promptLabel": "ocr|formula|table|chart|spotting|seal",
257
+ "useChartRecognition": false,
258
+ "useDocUnwarping": true,
259
+ "useDocOrientationClassify": true
260
+ }
261
+ """
262
+ verify_auth(authorization)
263
+
264
+ try:
265
+ body = await request.json()
266
+ except Exception:
267
+ raise HTTPException(status_code=400, detail="Invalid JSON body")
268
+
269
+ file_data = body.get("file", "")
270
+ if not file_data:
271
+ raise HTTPException(status_code=400, detail="Missing 'file' field")
272
+
273
+ use_layout = body.get("useLayoutDetection", False)
274
+ prompt_label = body.get("promptLabel", "ocr")
275
+ use_chart = body.get("useChartRecognition", False)
276
+ use_unwarp = body.get("useDocUnwarping", True)
277
+ use_orient = body.get("useDocOrientationClassify", True)
278
+
279
+ try:
280
+ if use_layout:
281
+ return full_document_parsing(file_data, use_chart, use_unwarp, use_orient)
282
+ else:
283
+ return element_level_recognition(file_data, prompt_label)
284
+ except Exception as e:
285
+ traceback.print_exc()
286
+ return {"errorCode": -1, "errorMsg": str(e)}
287
+
288
+
289
+ @app.post("/api/parse")
290
+ async def parse_file(
291
+ file: UploadFile = File(...),
292
+ use_layout_detection: bool = True,
293
+ prompt_label: str = "ocr",
294
+ authorization: Optional[str] = Header(None)
295
+ ):
296
+ """
297
+ File upload endpoint.
298
+
299
+ curl -X POST https://<space>.hf.space/api/parse -F "file=@document.png"
300
+ """
301
+ verify_auth(authorization)
302
+ content = await file.read()
303
+ b64 = base64.b64encode(content).decode("utf-8")
304
+
305
+ try:
306
+ if use_layout_detection:
307
+ return full_document_parsing(b64)
308
+ else:
309
+ return element_level_recognition(b64, prompt_label)
310
+ except Exception as e:
311
+ traceback.print_exc()
312
+ return {"errorCode": -1, "errorMsg": str(e)}
313
+
314
+
315
+ @app.post("/api/parse/markdown")
316
+ async def parse_to_markdown(
317
+ file: UploadFile = File(...),
318
+ authorization: Optional[str] = Header(None)
319
+ ):
320
+ """
321
+ Returns just markdown text.
322
+
323
+ curl -X POST https://<space>.hf.space/api/parse/markdown -F "file=@document.png"
324
+ """
325
+ verify_auth(authorization)
326
+ content = await file.read()
327
+ b64 = base64.b64encode(content).decode("utf-8")
328
+
329
+ try:
330
+ result = full_document_parsing(b64)
331
+ pages = result.get("result", {}).get("layoutParsingResults", [])
332
+ markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
333
+ return {
334
+ "status": "ok",
335
+ "markdown": "\n\n---\n\n".join(markdown_parts),
336
+ "page_count": len(pages)
337
+ }
338
+ except Exception as e:
339
+ traceback.print_exc()
340
+ raise HTTPException(status_code=500, detail=str(e))
341
+
342
+
343
+ @app.post("/v1/chat/completions")
344
+ async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
345
+ """Proxy to vLLM for direct OpenAI-compatible calls."""
346
+ verify_auth(authorization)
347
+
348
+ import httpx
349
+ body = await request.json()
350
+
351
+ async with httpx.AsyncClient(timeout=600) as client:
352
+ resp = await client.post(
353
+ f"{VLLM_SERVER_URL}/chat/completions",
354
+ json=body,
355
+ headers={"Content-Type": "application/json"}
356
+ )
357
+ return resp.json()
358
+
359
+
360
+ # =============================================================================
361
+ # Entry point
362
+ # =============================================================================
363
+ if __name__ == "__main__":
364
+ print(f"""
365
+ ╔══════════════════════════════════════════════════════════════╗
366
+ β•‘ PaddleOCR-VL-1.5 Bridge Server (HF Spaces) β•‘
367
+ ╠══════════════════════════════════════════════════════════════╣
368
+ β•‘ Bridge API: http://0.0.0.0:{BRIDGE_PORT} β•‘
369
+ β•‘ vLLM backend: {VLLM_SERVER_URL:<44s}β•‘
370
+ β•‘ Model: {VLLM_MODEL_NAME:<44s}β•‘
371
+ β•‘ Auth: {"ENABLED" if API_KEY else "DISABLED":<44s}β•‘
372
+ ╠══════════════════════════════════════════════════════════════╣
373
+ β•‘ Endpoints: β•‘
374
+ β•‘ GET /health - Health check β•‘
375
+ β•‘ GET /docs - Swagger UI β•‘
376
+ β•‘ POST /api/ocr - Gradio-compatible API β•‘
377
+ β•‘ POST /api/parse - File upload API β•‘
378
+ β•‘ POST /api/parse/markdown - Simple markdown output β•‘
379
+ β•‘ POST /v1/chat/completions - vLLM proxy (OpenAI format) β•‘
380
+ β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
381
+ """)
382
+ uvicorn.run(app, host="0.0.0.0", port=BRIDGE_PORT)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Web framework + ASGI server
fastapi
uvicorn[standard]
# Multipart form parsing for the UploadFile endpoints
python-multipart
# OpenAI-compatible client used to call the vLLM backend
openai
# HTTP clients: httpx for the async proxy, requests for image downloads
httpx
requests
# Document parsing pipeline + framework
paddleocr[doc-parser]
paddlepaddle==3.2.1