devappsmi committed on
Commit
3771183
·
verified ·
1 Parent(s): 84ef6a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -97
app.py CHANGED
@@ -1,19 +1,16 @@
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
- Deploys on Hugging Face Spaces as a FastAPI app.
5
- Connects to vLLM Docker running on your GPU server.
 
 
 
 
 
6
 
7
  Architecture:
8
- Gradio App (another HF Space or any client)
9
- |
10
- This HF Space (Bridge, port 7860)
11
- |
12
- Your GPU Server (vLLM Docker, 117.54.141.62:8000)
13
-
14
- HF Space Settings → Variables and secrets:
15
- VLLM_SERVER_URL = http://117.54.141.62:8000/v1
16
- API_KEY = (optional, for auth)
17
  """
18
 
19
  import base64
@@ -23,13 +20,14 @@ import shutil
23
  import tempfile
24
  import traceback
25
  import uuid
26
- from typing import Any, Dict, Optional
27
 
28
  import uvicorn
29
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
30
  from fastapi.middleware.cors import CORSMiddleware
31
  from fastapi.staticfiles import StaticFiles
32
  from openai import OpenAI
 
33
 
34
  # =============================================================================
35
  # Configuration
@@ -38,19 +36,18 @@ VLLM_SERVER_URL = os.environ.get("VLLM_SERVER_URL", "http://117.54.141.62:8000/v
38
  VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
39
  BRIDGE_PORT = int(os.environ.get("PORT", "7860"))
40
  API_KEY = os.environ.get("API_KEY", "")
41
- # Public base URL for serving static files (auto-detect from HF Space)
42
  SPACE_HOST = os.environ.get("SPACE_HOST", "")
43
  if SPACE_HOST:
44
  PUBLIC_BASE_URL = f"https://{SPACE_HOST}"
45
  else:
46
  PUBLIC_BASE_URL = os.environ.get("PUBLIC_BASE_URL", f"http://localhost:{BRIDGE_PORT}")
47
 
48
- # Directory to store and serve output images
49
  STATIC_DIR = "/tmp/ocr_outputs"
50
  os.makedirs(STATIC_DIR, exist_ok=True)
51
 
52
  # =============================================================================
53
- # Initialize OpenAI client
54
  # =============================================================================
55
  openai_client = OpenAI(
56
  api_key="EMPTY",
@@ -58,14 +55,10 @@ openai_client = OpenAI(
58
  timeout=600
59
  )
60
 
61
- # =============================================================================
62
- # PaddleOCR pipeline
63
- # =============================================================================
64
  pipeline = None
65
 
66
 
67
  def get_pipeline():
68
- """Lazy-load the PaddleOCR pipeline."""
69
  global pipeline
70
  if pipeline is None:
71
  from paddleocr import PaddleOCRVL
@@ -81,7 +74,7 @@ def get_pipeline():
81
  # =============================================================================
82
  app = FastAPI(
83
  title="PaddleOCR-VL-1.5 Bridge API",
84
- description="Full document parsing API bridge between Gradio UI and vLLM server",
85
  version="1.0.0"
86
  )
87
 
@@ -93,7 +86,6 @@ app.add_middleware(
93
  allow_headers=["*"],
94
  )
95
 
96
- # Serve static files (output images)
97
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
98
 
99
 
@@ -122,7 +114,6 @@ IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
122
 
123
 
124
  def save_temp_image(file_data: str) -> str:
125
- """Save base64 or URL image to temp file."""
126
  if file_data.startswith(("http://", "https://")):
127
  import requests as req
128
  resp = req.get(file_data, timeout=120)
@@ -146,139 +137,338 @@ def save_temp_image(file_data: str) -> str:
146
  return tmp.name
147
 
148
 
149
- def collect_output_images(output_dir: str, request_id: str) -> Dict[str, str]:
150
- """
151
- Find all image files in the output directory,
152
- copy them to the static dir, and return a dict of {name: public_url}.
153
- """
154
- output_images = {}
155
- if not os.path.exists(output_dir):
156
- return output_images
157
-
158
- # Create a subdirectory for this request
159
  static_subdir = os.path.join(STATIC_DIR, request_id)
160
  os.makedirs(static_subdir, exist_ok=True)
161
-
162
- for root, dirs, files in os.walk(output_dir):
163
- for filename in files:
164
- ext = os.path.splitext(filename)[1].lower()
 
 
 
 
 
 
 
 
 
165
  if ext in IMAGE_EXTENSIONS:
166
- src_path = os.path.join(root, filename)
167
- dst_path = os.path.join(static_subdir, filename)
168
- shutil.copy2(src_path, dst_path)
169
- public_url = f"{PUBLIC_BASE_URL}/static/{request_id}/{filename}"
170
- output_images[filename] = public_url
171
-
172
- return output_images
173
 
174
 
175
- def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
176
- """Element-level recognition via direct vLLM call."""
177
- if file_data.startswith(("http://", "https://")):
178
- image_url = file_data
179
- else:
180
- image_url = f"data:image/png;base64,{file_data}"
181
-
182
- task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
183
-
184
- response = openai_client.chat.completions.create(
185
- model=VLLM_MODEL_NAME,
186
- messages=[{
187
- "role": "user",
188
- "content": [
189
- {"type": "image_url", "image_url": {"url": image_url}},
190
- {"type": "text", "text": task_prompt}
191
- ]
192
- }],
193
- temperature=0.0
194
- )
195
 
196
- result_text = response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- return {
199
- "errorCode": 0,
200
- "result": {
201
- "layoutParsingResults": [{
202
- "markdown": {"text": result_text, "images": {}},
203
- "outputImages": {},
204
- "prunedResult": {
205
- "spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
206
- }
207
- }]
208
  }
209
- }
 
210
 
211
 
212
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
213
  use_doc_unwarping: bool = True,
214
  use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
215
- """Full document parsing with layout detection + VLM recognition."""
216
  tmp_path = save_temp_image(file_data)
217
  request_id = str(uuid.uuid4())[:12]
218
 
219
  try:
 
 
 
 
 
 
 
 
220
  pipe = get_pipeline()
221
  output = pipe.predict(tmp_path)
222
 
223
- results = []
 
 
 
224
  for i, res in enumerate(output):
 
225
  output_dir = tempfile.mkdtemp()
226
 
227
- # Save all outputs (json, markdown, images)
228
  res.save_to_json(save_path=output_dir)
229
  res.save_to_markdown(save_path=output_dir)
230
-
231
- # Try to save visualization image
232
  try:
233
  res.save_to_img(save_path=output_dir)
234
  except Exception:
235
  pass
236
 
237
- # Read markdown
238
  md_text = ""
239
  md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
240
  if md_files:
241
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
242
  md_text = f.read()
243
 
244
- # Read JSON
245
  json_data = {}
246
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
247
  if json_files:
248
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
249
  json_data = json.load(f)
250
 
251
- # Collect and serve output images
252
- page_request_id = f"{request_id}_page{i}"
253
- output_images = collect_output_images(output_dir, page_request_id)
254
 
255
- # Also check for images referenced in markdown
256
- md_images = {}
257
- for fname, url in output_images.items():
258
- # Replace local paths in markdown with public URLs
259
- md_text = md_text.replace(fname, url)
260
- md_images[fname] = url
 
 
 
261
 
262
- results.append({
263
- "markdown": {"text": md_text, "images": md_images},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  "outputImages": output_images,
265
- "jsonData": json_data
 
 
 
 
 
 
 
266
  })
267
 
268
  return {
269
  "errorCode": 0,
270
  "result": {
271
- "layoutParsingResults": results if results else [{
 
 
 
 
 
 
 
272
  "markdown": {"text": "", "images": {}},
273
- "outputImages": {}
274
- }]
 
 
 
 
 
 
 
275
  }
276
  }
 
277
  finally:
278
  if os.path.exists(tmp_path):
279
  os.unlink(tmp_path)
280
 
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def _parse_spotting(text: str) -> dict:
283
  try:
284
  return json.loads(text)
@@ -308,6 +498,7 @@ async def health():
308
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
309
  """
310
  Main OCR endpoint — compatible with the Gradio app.
 
311
 
312
  Body:
313
  {
 
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
+ Returns full JSON response matching the official Baidu API format, including:
5
+ - layoutParsingResults[].prunedResult (blocks, labels, bboxes, polygon points)
6
+ - layoutParsingResults[].markdown (text + images)
7
+ - layoutParsingResults[].outputImages (visualization URLs)
8
+ - layoutParsingResults[].inputImage
9
+ - preprocessedImages
10
+ - dataInfo
11
 
12
  Architecture:
13
+ Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
 
 
 
 
 
 
 
 
14
  """
15
 
16
  import base64
 
20
  import tempfile
21
  import traceback
22
  import uuid
23
+ from typing import Any, Dict, List, Optional
24
 
25
  import uvicorn
26
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
27
  from fastapi.middleware.cors import CORSMiddleware
28
  from fastapi.staticfiles import StaticFiles
29
  from openai import OpenAI
30
+ from PIL import Image
31
 
32
  # =============================================================================
33
  # Configuration
 
36
  VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "PaddleOCR-VL-1.5-0.9B")
37
  BRIDGE_PORT = int(os.environ.get("PORT", "7860"))
38
  API_KEY = os.environ.get("API_KEY", "")
39
+
40
  SPACE_HOST = os.environ.get("SPACE_HOST", "")
41
  if SPACE_HOST:
42
  PUBLIC_BASE_URL = f"https://{SPACE_HOST}"
43
  else:
44
  PUBLIC_BASE_URL = os.environ.get("PUBLIC_BASE_URL", f"http://localhost:{BRIDGE_PORT}")
45
 
 
46
  STATIC_DIR = "/tmp/ocr_outputs"
47
  os.makedirs(STATIC_DIR, exist_ok=True)
48
 
49
  # =============================================================================
50
+ # Initialize clients
51
  # =============================================================================
52
  openai_client = OpenAI(
53
  api_key="EMPTY",
 
55
  timeout=600
56
  )
57
 
 
 
 
58
  pipeline = None
59
 
60
 
61
  def get_pipeline():
 
62
  global pipeline
63
  if pipeline is None:
64
  from paddleocr import PaddleOCRVL
 
74
  # =============================================================================
75
  app = FastAPI(
76
  title="PaddleOCR-VL-1.5 Bridge API",
77
+ description="Full document parsing API matching official Baidu API format",
78
  version="1.0.0"
79
  )
80
 
 
86
  allow_headers=["*"],
87
  )
88
 
 
89
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
90
 
91
 
 
114
 
115
 
116
  def save_temp_image(file_data: str) -> str:
 
117
  if file_data.startswith(("http://", "https://")):
118
  import requests as req
119
  resp = req.get(file_data, timeout=120)
 
137
  return tmp.name
138
 
139
 
140
+ def serve_file(src_path: str, request_id: str, filename: str) -> str:
141
+ """Copy a file to the static dir and return its public URL."""
 
 
 
 
 
 
 
 
142
  static_subdir = os.path.join(STATIC_DIR, request_id)
143
  os.makedirs(static_subdir, exist_ok=True)
144
+ dst_path = os.path.join(static_subdir, filename)
145
+ shutil.copy2(src_path, dst_path)
146
+ return f"{PUBLIC_BASE_URL}/static/{request_id}/{filename}"
147
+
148
+
149
+ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
150
+ """Find all images in a directory and serve them. Returns {filename: url}."""
151
+ result = {}
152
+ if not os.path.exists(directory):
153
+ return result
154
+ for root, dirs, files in os.walk(directory):
155
+ for fname in files:
156
+ ext = os.path.splitext(fname)[1].lower()
157
  if ext in IMAGE_EXTENSIONS:
158
+ src = os.path.join(root, fname)
159
+ # Preserve subdirectory structure in the filename
160
+ rel_path = os.path.relpath(src, directory)
161
+ safe_name = rel_path.replace(os.sep, "_")
162
+ url = serve_file(src, request_id, safe_name)
163
+ result[rel_path] = url
164
+ return result
165
 
166
 
167
+ def extract_pruned_result(res_obj, page_index: int = 0) -> Dict[str, Any]:
168
+ """
169
+ Extract the full prunedResult from a PaddleOCR result object,
170
+ matching the official Baidu API format.
171
+ """
172
+ pruned = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
+ try:
175
+ # Try to get the raw dict/json from the result object
176
+ if hasattr(res_obj, 'json'):
177
+ raw = res_obj.json if isinstance(res_obj.json, dict) else {}
178
+ elif hasattr(res_obj, '_result'):
179
+ raw = res_obj._result if isinstance(res_obj._result, dict) else {}
180
+ elif hasattr(res_obj, 'to_dict'):
181
+ raw = res_obj.to_dict()
182
+ else:
183
+ raw = {}
184
+
185
+ # Try multiple attribute paths to find the parsing results
186
+ parsing_res_list = []
187
+ layout_det_res = {"boxes": []}
188
+
189
+ # Check common attribute names
190
+ for attr in ['parsing_res_list', 'parsing_result', 'blocks']:
191
+ if hasattr(res_obj, attr):
192
+ parsing_res_list = getattr(res_obj, attr, [])
193
+ break
194
+
195
+ # Check for layout detection results
196
+ for attr in ['layout_det_res', 'layout_result', 'det_res']:
197
+ if hasattr(res_obj, attr):
198
+ layout_det_res = getattr(res_obj, attr, {})
199
+ break
200
+
201
+ # Get image dimensions
202
+ width = 0
203
+ height = 0
204
+ for attr in ['img_width', 'width']:
205
+ if hasattr(res_obj, attr):
206
+ width = getattr(res_obj, attr, 0)
207
+ break
208
+ for attr in ['img_height', 'height']:
209
+ if hasattr(res_obj, attr):
210
+ height = getattr(res_obj, attr, 0)
211
+ break
212
+
213
+ # If we got raw dict, try to extract from it
214
+ if raw and not parsing_res_list:
215
+ parsing_res_list = raw.get('parsing_res_list', raw.get('blocks', []))
216
+ layout_det_res = raw.get('layout_det_res', {"boxes": []})
217
+ width = raw.get('width', width)
218
+ height = raw.get('height', height)
219
+
220
+ pruned = {
221
+ "page_count": 1,
222
+ "width": width,
223
+ "height": height,
224
+ "model_settings": {
225
+ "use_doc_preprocessor": False,
226
+ "use_layout_detection": True,
227
+ "use_chart_recognition": False,
228
+ "use_seal_recognition": True,
229
+ "use_ocr_for_image_block": False,
230
+ "format_block_content": True,
231
+ "merge_layout_blocks": True,
232
+ "markdown_ignore_labels": [
233
+ "number", "footnote", "header",
234
+ "header_image", "footer", "footer_image", "aside_text"
235
+ ],
236
+ "return_layout_polygon_points": True
237
+ },
238
+ "parsing_res_list": parsing_res_list if isinstance(parsing_res_list, list) else [],
239
+ "layout_det_res": layout_det_res if isinstance(layout_det_res, dict) else {"boxes": []}
240
+ }
241
 
242
+ except Exception as e:
243
+ print(f"Warning: Could not extract prunedResult: {e}")
244
+ traceback.print_exc()
245
+ pruned = {
246
+ "page_count": 1,
247
+ "width": 0,
248
+ "height": 0,
249
+ "model_settings": {},
250
+ "parsing_res_list": [],
251
+ "layout_det_res": {"boxes": []}
252
  }
253
+
254
+ return pruned
255
 
256
 
257
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
258
  use_doc_unwarping: bool = True,
259
  use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
260
+ """Full document parsing returns response matching official Baidu API format."""
261
  tmp_path = save_temp_image(file_data)
262
  request_id = str(uuid.uuid4())[:12]
263
 
264
  try:
265
+ # Get image dimensions
266
+ try:
267
+ img = Image.open(tmp_path)
268
+ img_width, img_height = img.size
269
+ img.close()
270
+ except Exception:
271
+ img_width, img_height = 0, 0
272
+
273
  pipe = get_pipeline()
274
  output = pipe.predict(tmp_path)
275
 
276
+ layout_parsing_results = []
277
+ preprocessed_images = []
278
+ data_info_pages = []
279
+
280
  for i, res in enumerate(output):
281
+ page_id = f"{request_id}_p{i}"
282
  output_dir = tempfile.mkdtemp()
283
 
284
+ # Save all outputs
285
  res.save_to_json(save_path=output_dir)
286
  res.save_to_markdown(save_path=output_dir)
 
 
287
  try:
288
  res.save_to_img(save_path=output_dir)
289
  except Exception:
290
  pass
291
 
292
+ # --- Read markdown ---
293
  md_text = ""
294
  md_files = [f for f in os.listdir(output_dir) if f.endswith(".md")]
295
  if md_files:
296
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
297
  md_text = f.read()
298
 
299
+ # --- Read JSON (contains prunedResult data) ---
300
  json_data = {}
301
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
302
  if json_files:
303
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
304
  json_data = json.load(f)
305
 
306
+ # --- Collect and serve all images ---
307
+ all_images = collect_images_from_dir(output_dir, page_id)
 
308
 
309
+ # --- Build outputImages ---
310
+ output_images = {}
311
+ for rel_path, url in all_images.items():
312
+ name = os.path.splitext(os.path.basename(rel_path))[0]
313
+ # Identify layout detection visualization
314
+ if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
315
+ output_images["layout_det_res"] = url
316
+ else:
317
+ output_images[name] = url
318
 
319
+ # --- Build markdown images map ---
320
+ md_images = {}
321
+ imgs_dir = os.path.join(output_dir, "imgs")
322
+ if os.path.exists(imgs_dir):
323
+ for fname in os.listdir(imgs_dir):
324
+ ext = os.path.splitext(fname)[1].lower()
325
+ if ext in IMAGE_EXTENSIONS:
326
+ src = os.path.join(imgs_dir, fname)
327
+ url = serve_file(src, page_id, fname)
328
+ local_ref = f"imgs/{fname}"
329
+ md_images[local_ref] = url
330
+ # Replace references in markdown
331
+ md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
332
+ md_text = md_text.replace(f']({local_ref})', f']({url})')
333
+
334
+ # --- Serve input image ---
335
+ input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
336
+
337
+ # --- Build prunedResult from JSON data or result object ---
338
+ pruned_result = {}
339
+ if json_data:
340
+ # Try to use the saved JSON directly
341
+ pruned_result = {
342
+ "page_count": json_data.get("page_count", 1),
343
+ "width": json_data.get("width", img_width),
344
+ "height": json_data.get("height", img_height),
345
+ "model_settings": json_data.get("model_settings", {
346
+ "use_doc_preprocessor": False,
347
+ "use_layout_detection": True,
348
+ "use_chart_recognition": use_chart_recognition,
349
+ "use_seal_recognition": True,
350
+ "use_ocr_for_image_block": False,
351
+ "format_block_content": True,
352
+ "merge_layout_blocks": True,
353
+ "markdown_ignore_labels": [
354
+ "number", "footnote", "header",
355
+ "header_image", "footer", "footer_image", "aside_text"
356
+ ],
357
+ "return_layout_polygon_points": True
358
+ }),
359
+ "parsing_res_list": json_data.get("parsing_res_list",
360
+ json_data.get("blocks", [])),
361
+ "layout_det_res": json_data.get("layout_det_res",
362
+ json_data.get("det_res", {"boxes": []}))
363
+ }
364
+ else:
365
+ pruned_result = extract_pruned_result(res, i)
366
+
367
+ # Ensure dimensions are set
368
+ if not pruned_result.get("width"):
369
+ pruned_result["width"] = img_width
370
+ if not pruned_result.get("height"):
371
+ pruned_result["height"] = img_height
372
+
373
+ # --- Build page result ---
374
+ page_result = {
375
+ "prunedResult": pruned_result,
376
+ "markdown": {
377
+ "text": md_text,
378
+ "images": md_images
379
+ },
380
  "outputImages": output_images,
381
+ "inputImage": input_image_url
382
+ }
383
+
384
+ layout_parsing_results.append(page_result)
385
+ preprocessed_images.append(input_image_url)
386
+ data_info_pages.append({
387
+ "width": img_width,
388
+ "height": img_height
389
  })
390
 
391
  return {
392
  "errorCode": 0,
393
  "result": {
394
+ "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
395
+ "prunedResult": {
396
+ "page_count": 0,
397
+ "width": 0,
398
+ "height": 0,
399
+ "parsing_res_list": [],
400
+ "layout_det_res": {"boxes": []}
401
+ },
402
  "markdown": {"text": "", "images": {}},
403
+ "outputImages": {},
404
+ "inputImage": ""
405
+ }],
406
+ "preprocessedImages": preprocessed_images,
407
+ "dataInfo": {
408
+ "type": "image",
409
+ "numPages": len(layout_parsing_results),
410
+ "pages": data_info_pages
411
+ }
412
  }
413
  }
414
+
415
  finally:
416
  if os.path.exists(tmp_path):
417
  os.unlink(tmp_path)
418
 
419
 
420
+ def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
421
+ """Element-level recognition via direct vLLM call."""
422
+ if file_data.startswith(("http://", "https://")):
423
+ image_url = file_data
424
+ else:
425
+ image_url = f"data:image/png;base64,{file_data}"
426
+
427
+ task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
428
+
429
+ response = openai_client.chat.completions.create(
430
+ model=VLLM_MODEL_NAME,
431
+ messages=[{
432
+ "role": "user",
433
+ "content": [
434
+ {"type": "image_url", "image_url": {"url": image_url}},
435
+ {"type": "text", "text": task_prompt}
436
+ ]
437
+ }],
438
+ temperature=0.0
439
+ )
440
+
441
+ result_text = response.choices[0].message.content
442
+
443
+ return {
444
+ "errorCode": 0,
445
+ "result": {
446
+ "layoutParsingResults": [{
447
+ "prunedResult": {
448
+ "page_count": 1,
449
+ "width": 0,
450
+ "height": 0,
451
+ "parsing_res_list": [{
452
+ "block_label": prompt_label,
453
+ "block_content": result_text,
454
+ "block_bbox": [],
455
+ "block_id": 0,
456
+ "block_order": 0,
457
+ "group_id": 0,
458
+ "global_block_id": 0,
459
+ "global_group_id": 0,
460
+ "block_polygon_points": []
461
+ }],
462
+ "layout_det_res": {"boxes": []}
463
+ },
464
+ "markdown": {"text": result_text, "images": {}},
465
+ "outputImages": {},
466
+ "prunedResult.spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
467
+ }]
468
+ }
469
+ }
470
+
471
+
472
  def _parse_spotting(text: str) -> dict:
473
  try:
474
  return json.loads(text)
 
498
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
499
  """
500
  Main OCR endpoint — compatible with the Gradio app.
501
+ Returns full JSON matching official Baidu API format.
502
 
503
  Body:
504
  {