devappsmi commited on
Commit
bd52104
Β·
verified Β·
1 Parent(s): cdfa6df

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. README.md +17 -7
  3. app.py +382 -0
  4. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal runtime image for the PaddleOCR-VL bridge server (HF Spaces).
FROM python:3.10-slim

WORKDIR /app

# Native libraries needed at runtime by OpenCV / PaddleOCR wheels.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so Docker layer
# caching survives app.py edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# HF Spaces routes external traffic to this port (see app_port in README).
EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,22 @@
1
  ---
2
- title: Document Parse
3
- emoji: πŸ‘€
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: docker
 
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Bridge Server
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: PaddleOCR-VL-1.5 Bridge API
3
+ emoji: πŸ“„
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
 
 
9
  ---
10
 
11
+ # PaddleOCR-VL-1.5 Bridge API
12
+
13
+ Bridge server that connects to a vLLM backend for full document parsing.
14
+
15
+ ## Endpoints
16
+
17
+ - `GET /health` - Health check
18
+ - `GET /docs` - Swagger UI
19
+ - `POST /api/ocr` - Gradio-compatible OCR API
20
+ - `POST /api/parse` - File upload API
21
+ - `POST /api/parse/markdown` - Returns markdown only
22
+ - `POST /v1/chat/completions` - OpenAI-compatible proxy
app.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
+ ====================================================
4
+ Deploys on Hugging Face Spaces as a FastAPI app.
5
+ Connects to vLLM Docker running on your GPU server.
6
+
7
+ Architecture:
8
+ Gradio App (another HF Space or any client)
9
+ |
10
+ This HF Space (Bridge, port 7860)
11
+ |
12
+ Your GPU Server (vLLM Docker, 117.54.141.62:8000)
13
+
14
+ HF Space Settings β†’ Variables and secrets:
15
+ VLLM_SERVER_URL = http://117.54.141.62:8000/v1
16
+ API_KEY = (optional, for auth)
17
+
18
+ Your GPU Server:
19
+ docker run --rm --gpus all -p 8000:8000 -v ~/.cache/paddleocr:/root/.cache ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-nvidia-gpu paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8000 --backend vllm
20
+
21
+ Gradio App HF Space env:
22
+ API_URL = https://<your-bridge-space>.hf.space/api/ocr
23
+ """
24
+
25
+ import base64
26
+ import json
27
+ import os
28
+ import tempfile
29
+ import traceback
30
+ from typing import Any, Dict, Optional
31
+
32
+ import uvicorn
33
+ from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
34
+ from fastapi.middleware.cors import CORSMiddleware
35
+ from openai import OpenAI
36
+
37
+ # =============================================================================
38
+ # Configuration
39
+ # =============================================================================
40
+ VLLM_SERVER_URL = os.environ.get("VLLM_SERVER_URL", "http://117.54.141.62:8000/v1")
41
+ VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
42
+ BRIDGE_PORT = int(os.environ.get("PORT", "7860")) # HF Spaces default port
43
+ API_KEY = os.environ.get("API_KEY", "")
44
+
45
+ # =============================================================================
46
+ # Initialize OpenAI client (for element-level recognition)
47
+ # =============================================================================
48
+ openai_client = OpenAI(
49
+ api_key="EMPTY",
50
+ base_url=VLLM_SERVER_URL,
51
+ timeout=600
52
+ )
53
+
54
+ # =============================================================================
55
+ # PaddleOCR pipeline (for full document parsing with layout detection)
56
+ # =============================================================================
57
+ pipeline = None
58
+
59
+
60
def get_pipeline():
    """Return the shared PaddleOCRVL pipeline, creating it on first use.

    The import and construction are deferred so the Space boots fast and
    element-level requests never pay the pipeline startup cost.
    """
    global pipeline
    if pipeline is not None:
        return pipeline
    from paddleocr import PaddleOCRVL
    pipeline = PaddleOCRVL(
        vl_rec_backend="vllm-server",
        vl_rec_server_url=VLLM_SERVER_URL
    )
    return pipeline
70
+
71
+
72
+ # =============================================================================
73
+ # FastAPI App
74
+ # =============================================================================
75
# =============================================================================
# FastAPI application. CORS is fully open because the bridge is called
# cross-origin from other HF Spaces / browser clients.
# =============================================================================
app = FastAPI(
    title="PaddleOCR-VL-1.5 Bridge API",
    description="Full document parsing API — bridge between Gradio UI and vLLM server",
    version="1.0.0"
)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
88
+
89
+
90
+ # =============================================================================
91
+ # Auth
92
+ # =============================================================================
93
def verify_auth(authorization: Optional[str] = None) -> None:
    """Reject the request unless it carries the configured bearer token.

    Auth is enforced only when the API_KEY env var is set to a non-blank
    value; otherwise every request passes.

    Raises:
        HTTPException: 401 when a key is configured and the Authorization
            header is missing or does not equal ``Bearer <API_KEY>``.
    """
    if API_KEY and API_KEY.strip():
        import hmac  # local import: only needed when auth is enabled
        expected = f"Bearer {API_KEY}"
        # compare_digest is constant-time, unlike `==`, so the token can't
        # be probed character-by-character via response timing.
        if not authorization or not hmac.compare_digest(authorization, expected):
            raise HTTPException(status_code=401, detail="Unauthorized")
97
+
98
+
99
+ # =============================================================================
100
+ # Helpers
101
+ # =============================================================================
102
# Client-facing task labels mapped to the exact prompt strings the
# PaddleOCR-VL model expects for each element-level recognition task.
TASK_PROMPTS: Dict[str, str] = {
    "ocr": "OCR:",
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
    "chart": "Chart Recognition:",
    "spotting": "Spotting:",
    "seal": "Seal Recognition:",
}
110
+
111
+
112
def save_temp_image(file_data: str) -> str:
    """Materialize an image given as a URL, base64, or data URI into a temp file.

    Args:
        file_data: An http(s) URL, a raw base64 string, or a
            ``data:<mime>;base64,<payload>`` URI (new, backward-compatible).

    Returns:
        Path to a newly created temporary file holding the image bytes.
        The caller is responsible for deleting the file.
    """
    if file_data.startswith(("http://", "https://")):
        import requests as req
        resp = req.get(file_data, timeout=120)
        resp.raise_for_status()
        content = resp.content
        # Pick a suffix from the response content type; default to .png.
        ct = resp.headers.get("content-type", "image/png")
        ext = ".png"
        if "jpeg" in ct or "jpg" in ct:
            ext = ".jpg"
        elif "webp" in ct:
            ext = ".webp"
        elif "bmp" in ct:
            ext = ".bmp"
    else:
        # Accept "data:...;base64,<payload>" URIs as well as bare base64.
        if file_data.startswith("data:") and "," in file_data:
            file_data = file_data.split(",", 1)[1]
        content = base64.b64decode(file_data)
        ext = ".png"

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    try:
        tmp.write(content)
    finally:
        # Always release the handle, even if the write fails mid-way.
        tmp.close()
    return tmp.name
135
+
136
+
137
def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
    """Run one element-level recognition task via a direct vLLM call.

    Args:
        file_data: An http(s) URL or a raw base64-encoded image.
        prompt_label: One of the TASK_PROMPTS keys; unknown labels fall
            back to plain OCR.

    Returns:
        A dict shaped like the Gradio-compatible OCR response payload.
    """
    is_remote = file_data.startswith(("http://", "https://"))
    image_url = file_data if is_remote else f"data:image/png;base64,{file_data}"

    message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": TASK_PROMPTS.get(prompt_label, "OCR:")},
        ],
    }
    response = openai_client.chat.completions.create(
        model=VLLM_MODEL_NAME,
        messages=[message],
        temperature=0.0,
    )
    result_text = response.choices[0].message.content

    # Spotting responses carry structured coordinates; try to parse them.
    spotting = _parse_spotting(result_text) if prompt_label == "spotting" else {}
    page = {
        "markdown": {"text": result_text, "images": {}},
        "outputImages": {},
        "prunedResult": {"spotting_res": spotting},
    }
    return {"errorCode": 0, "result": {"layoutParsingResults": [page]}}
172
+
173
+
174
def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                          use_doc_unwarping: bool = True,
                          use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
    """Full document parsing: layout detection + per-region VLM recognition.

    Args:
        file_data: An http(s) URL or base64-encoded image/document.
        use_chart_recognition, use_doc_unwarping, use_doc_orientation_classify:
            Accepted for API compatibility.
            NOTE(review): these flags are currently NOT forwarded to the
            PaddleOCR pipeline — confirm the PaddleOCRVL predict() kwargs
            and wire them through.

    Returns:
        Gradio-compatible payload with one layoutParsingResults entry per
        parsed page (markdown text plus the raw JSON result).
    """
    import shutil  # local import: only needed for scratch-dir cleanup

    tmp_path = save_temp_image(file_data)
    try:
        pipe = get_pipeline()
        output = pipe.predict(tmp_path)

        results = []
        for res in output:
            # Each page result is serialized to a scratch dir, then read back.
            output_dir = tempfile.mkdtemp()
            try:
                res.save_to_json(save_path=output_dir)
                res.save_to_markdown(save_path=output_dir)

                md_text = ""
                md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
                if md_files:
                    with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
                        md_text = f.read()

                json_data = {}
                json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
                if json_files:
                    with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
                        json_data = json.load(f)
            finally:
                # The original code leaked one temp dir per page; remove it
                # now that its contents are in memory.
                shutil.rmtree(output_dir, ignore_errors=True)

            results.append({
                "markdown": {"text": md_text, "images": {}},
                "outputImages": {},
                "jsonData": json_data
            })

        return {
            "errorCode": 0,
            "result": {
                "layoutParsingResults": results if results else [{
                    "markdown": {"text": "", "images": {}},
                    "outputImages": {}
                }]
            }
        }
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
220
+
221
+
222
+ def _parse_spotting(text: str) -> dict:
223
+ try:
224
+ return json.loads(text)
225
+ except (json.JSONDecodeError, TypeError):
226
+ return {"raw_text": text}
227
+
228
+
229
+ # =============================================================================
230
+ # Endpoints
231
+ # =============================================================================
232
+
233
+ @app.get("/")
234
+ async def root():
235
+ return {
236
+ "service": "PaddleOCR-VL-1.5 Bridge API",
237
+ "status": "running",
238
+ "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
239
+ }
240
+
241
+
242
+ @app.get("/health")
243
+ async def health():
244
+ return {"status": "ok", "model": VLLM_MODEL_NAME, "vllm_url": VLLM_SERVER_URL}
245
+
246
+
247
+ @app.post("/api/ocr")
248
+ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
249
+ """
250
+ Main OCR endpoint β€” compatible with the Gradio app.
251
+
252
+ Body:
253
+ {
254
+ "file": "base64_or_url",
255
+ "useLayoutDetection": true/false,
256
+ "promptLabel": "ocr|formula|table|chart|spotting|seal",
257
+ "useChartRecognition": false,
258
+ "useDocUnwarping": true,
259
+ "useDocOrientationClassify": true
260
+ }
261
+ """
262
+ verify_auth(authorization)
263
+
264
+ try:
265
+ body = await request.json()
266
+ except Exception:
267
+ raise HTTPException(status_code=400, detail="Invalid JSON body")
268
+
269
+ file_data = body.get("file", "")
270
+ if not file_data:
271
+ raise HTTPException(status_code=400, detail="Missing 'file' field")
272
+
273
+ use_layout = body.get("useLayoutDetection", False)
274
+ prompt_label = body.get("promptLabel", "ocr")
275
+ use_chart = body.get("useChartRecognition", False)
276
+ use_unwarp = body.get("useDocUnwarping", True)
277
+ use_orient = body.get("useDocOrientationClassify", True)
278
+
279
+ try:
280
+ if use_layout:
281
+ return full_document_parsing(file_data, use_chart, use_unwarp, use_orient)
282
+ else:
283
+ return element_level_recognition(file_data, prompt_label)
284
+ except Exception as e:
285
+ traceback.print_exc()
286
+ return {"errorCode": -1, "errorMsg": str(e)}
287
+
288
+
289
+ @app.post("/api/parse")
290
+ async def parse_file(
291
+ file: UploadFile = File(...),
292
+ use_layout_detection: bool = True,
293
+ prompt_label: str = "ocr",
294
+ authorization: Optional[str] = Header(None)
295
+ ):
296
+ """
297
+ File upload endpoint.
298
+
299
+ curl -X POST https://<space>.hf.space/api/parse -F "file=@document.png"
300
+ """
301
+ verify_auth(authorization)
302
+ content = await file.read()
303
+ b64 = base64.b64encode(content).decode("utf-8")
304
+
305
+ try:
306
+ if use_layout_detection:
307
+ return full_document_parsing(b64)
308
+ else:
309
+ return element_level_recognition(b64, prompt_label)
310
+ except Exception as e:
311
+ traceback.print_exc()
312
+ return {"errorCode": -1, "errorMsg": str(e)}
313
+
314
+
315
+ @app.post("/api/parse/markdown")
316
+ async def parse_to_markdown(
317
+ file: UploadFile = File(...),
318
+ authorization: Optional[str] = Header(None)
319
+ ):
320
+ """
321
+ Returns just markdown text.
322
+
323
+ curl -X POST https://<space>.hf.space/api/parse/markdown -F "file=@document.png"
324
+ """
325
+ verify_auth(authorization)
326
+ content = await file.read()
327
+ b64 = base64.b64encode(content).decode("utf-8")
328
+
329
+ try:
330
+ result = full_document_parsing(b64)
331
+ pages = result.get("result", {}).get("layoutParsingResults", [])
332
+ markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
333
+ return {
334
+ "status": "ok",
335
+ "markdown": "\n\n---\n\n".join(markdown_parts),
336
+ "page_count": len(pages)
337
+ }
338
+ except Exception as e:
339
+ traceback.print_exc()
340
+ raise HTTPException(status_code=500, detail=str(e))
341
+
342
+
343
+ @app.post("/v1/chat/completions")
344
+ async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
345
+ """Proxy to vLLM for direct OpenAI-compatible calls."""
346
+ verify_auth(authorization)
347
+
348
+ import httpx
349
+ body = await request.json()
350
+
351
+ async with httpx.AsyncClient(timeout=600) as client:
352
+ resp = await client.post(
353
+ f"{VLLM_SERVER_URL}/chat/completions",
354
+ json=body,
355
+ headers={"Content-Type": "application/json"}
356
+ )
357
+ return resp.json()
358
+
359
+
360
+ # =============================================================================
361
+ # Entry point
362
+ # =============================================================================
363
+ if __name__ == "__main__":
364
+ print(f"""
365
+ ╔══════════════════════════════════════════════════════════════╗
366
+ β•‘ PaddleOCR-VL-1.5 Bridge Server (HF Spaces) β•‘
367
+ ╠══════════════════════════════════════════════════════════════╣
368
+ β•‘ Bridge API: http://0.0.0.0:{BRIDGE_PORT} β•‘
369
+ β•‘ vLLM backend: {VLLM_SERVER_URL:<44s}β•‘
370
+ β•‘ Model: {VLLM_MODEL_NAME:<44s}β•‘
371
+ β•‘ Auth: {"ENABLED" if API_KEY else "DISABLED":<44s}β•‘
372
+ ╠══════════════════════════════════════════════════════════════╣
373
+ β•‘ Endpoints: β•‘
374
+ β•‘ GET /health - Health check β•‘
375
+ β•‘ GET /docs - Swagger UI β•‘
376
+ β•‘ POST /api/ocr - Gradio-compatible API β•‘
377
+ β•‘ POST /api/parse - File upload API β•‘
378
+ β•‘ POST /api/parse/markdown - Simple markdown output β•‘
379
+ β•‘ POST /v1/chat/completions - vLLM proxy (OpenAI format) β•‘
380
+ β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
381
+ """)
382
+ uvicorn.run(app, host="0.0.0.0", port=BRIDGE_PORT)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Web framework + ASGI server
fastapi
uvicorn[standard]
# Multipart form parsing for the UploadFile endpoints
python-multipart
# OpenAI-compatible client used to call the vLLM backend
openai
# HTTP clients: httpx for the async proxy, requests for image downloads
httpx
requests
# Document parsing pipeline + framework
paddleocr[doc-parser]
paddlepaddle==3.2.1