heerjtdev committed on
Commit
4d661b4
·
verified ·
1 Parent(s): d06d1bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +702 -164
app.py CHANGED
@@ -1,21 +1,599 @@
1
 
2
- import base64
3
- from PIL import Image
4
- import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
6
 
 
 
 
 
 
 
7
 
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import fitz # PyMuPDF
12
  import numpy as np
13
  import cv2
14
  import torch
15
  import torch.serialization
16
  import os
17
- import time
18
- from typing import Optional, Tuple, List, Dict, Any
19
  from ultralytics import YOLO
20
  import logging
21
  import gradio as gr
@@ -40,29 +618,23 @@ logging.basicConfig(level=logging.WARNING)
40
  # --- CONFIGURATION AND CONSTANTS ---
41
  # ============================================================================
42
 
43
- WEIGHTS_PATH = 'best.pt'
44
- SCALE_FACTOR = 2.0
45
- # OUTPUT_DIR = "yolo_extracted_regions"
46
- # OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
47
 
 
48
  from transformers import TrOCRProcessor
49
  from optimum.onnxruntime import ORTModelForVision2Seq
50
 
51
-
52
-
53
  MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
54
- processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
55
- ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
56
-
57
-
58
-
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
 
67
  # Detection parameters
68
  CONF_THRESHOLD = 0.2
@@ -70,12 +642,13 @@ TARGET_CLASSES = ['figure', 'equation']
70
  IOU_MERGE_THRESHOLD = 0.4
71
  IOA_SUPPRESSION_THRESHOLD = 0.7
72
 
73
- # Global counters (Reset per run)
74
- GLOBAL_FIGURE_COUNT = 0
75
- GLOBAL_EQUATION_COUNT = 0
 
76
 
77
  # ============================================================================
78
- # --- BOX COMBINATION LOGIC (Retained for detection accuracy) ---
79
  # ============================================================================
80
 
81
  def calculate_iou(box1, box2):
@@ -136,7 +709,7 @@ def merge_overlapping_boxes(detections, iou_threshold):
136
  merged_x1 = min(merged_x1, other_box[0])
137
  merged_y1 = min(merged_y1, other_box[1])
138
  merged_x2 = max(merged_x2, other_box[2])
139
- merged_y2 = max(merged_y2, other_box[3])
140
  is_merged[j] = True
141
  merged_detections.append({
142
  'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
@@ -160,18 +733,46 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
160
  return img
161
 
162
 
 
 
 
163
 
 
 
 
 
164
 
165
- def run_yolo_detection_and_count(
166
- image: np.ndarray, model: YOLO, page_num: int
167
- ) -> Tuple[int, int, List[Dict[str, str]]]:
 
168
 
169
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
170
 
171
- yolo_detections = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  page_equations = 0
173
  page_figures = 0
174
  detected_items = []
 
175
 
176
  try:
177
  results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
@@ -189,7 +790,7 @@ def run_yolo_detection_and_count(
189
  })
190
  except Exception as e:
191
  logging.error(f"YOLO inference failed on page {page_num}: {e}")
192
- return 0, 0, []
193
 
194
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
195
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
@@ -198,31 +799,34 @@ def run_yolo_detection_and_count(
198
  bbox = det["coords"]
199
 
200
  if det["class"] == "equation":
201
- GLOBAL_EQUATION_COUNT += 1
202
  page_equations += 1
203
 
204
  b64 = crop_and_convert_to_base64(image, bbox)
205
  detected_items.append({
206
  "type": "equation",
207
- "id": f"EQUATION{GLOBAL_EQUATION_COUNT}",
208
  "base64": b64
209
  })
210
 
211
  elif det["class"] == "figure":
212
- GLOBAL_FIGURE_COUNT += 1
213
  page_figures += 1
214
 
215
  b64 = crop_and_convert_to_base64(image, bbox)
216
  detected_items.append({
217
  "type": "figure",
218
- "id": f"FIGURE{GLOBAL_FIGURE_COUNT}",
219
  "base64": b64
220
  })
221
 
222
  logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
223
- return page_equations, page_figures, detected_items
 
224
 
225
 
 
 
226
 
227
  def get_latex_from_base64(base64_string: str) -> str:
228
  if ort_model is None or processor is None:
@@ -248,44 +852,6 @@ def get_latex_from_base64(base64_string: str) -> str:
248
  return f"[TR_OCR_ERROR: {e}]"
249
 
250
 
251
-
252
-
253
-
254
-
255
-
256
-
257
-
258
-
259
-
260
-
261
-
262
-
263
-
264
-
265
- def extract_images_from_page_in_memory(page) -> Dict[str, str]:
266
- """
267
- Extract images from a page and return:
268
- { "EQUATION1": base64_string, "FIGURE1": base64_string }
269
- """
270
- image_map = {}
271
- image_list = page.get_images(full=True)
272
-
273
- for idx, img in enumerate(image_list, start=1):
274
- xref = img[0]
275
- base = page.parent.extract_image(xref)
276
- image_bytes = base["image"]
277
-
278
- base64_img = base64.b64encode(image_bytes).decode("utf-8")
279
-
280
- # Convention: first image = FIGURE1, second image = EQUATION1 etc
281
- # You can tune this if needed
282
- image_map[f"FIGURE{idx}"] = base64_img
283
-
284
- return image_map
285
-
286
-
287
-
288
-
289
  def embed_images_as_base64_in_memory(structured_data, detected_items):
290
  tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
291
 
@@ -326,78 +892,40 @@ def embed_images_as_base64_in_memory(structured_data, detected_items):
326
  final_data.append(item)
327
 
328
  return final_data
329
-
330
-
331
-
332
-
333
-
334
-
335
- def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
336
- x1, y1, x2, y2 = map(int, bbox)
337
- h, w, _ = image.shape
338
-
339
- x1 = max(0, x1)
340
- y1 = max(0, y1)
341
- x2 = min(w, x2)
342
- y2 = min(h, y2)
343
-
344
- crop = image[y1:y2, x1:x2]
345
- _, buffer = cv2.imencode(".png", crop)
346
-
347
- return base64.b64encode(buffer).decode("utf-8")
348
-
349
-
350
-
351
-
352
-
353
-
354
-
355
-
356
-
357
-
358
-
359
-
360
-
361
-
362
-
363
-
364
-
365
 
366
  # ============================================================================
367
- # --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for JSON serialization) ---
368
  # ============================================================================
369
 
370
- # NOTE: The return signature now uses Dict[str, int] for the equation counts
371
- def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, float, Dict[str, int], List[str]]:
 
 
372
  """
373
- Runs the pipeline, returns counts, report, total time, page counts dict (str keys), and empty list.
 
374
  """
375
 
376
-
377
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
378
  start_time = time.time()
379
  log_messages = []
380
- all_saved_images = []
381
- all_base64_images: List[str] = []
 
382
 
383
  # Dictionary to store {page_number (int): equation_count (int)}
384
  equation_counts_per_page: Dict[int, int] = {}
385
 
386
- # Reset globals
387
- GLOBAL_FIGURE_COUNT = 0
388
- GLOBAL_EQUATION_COUNT = 0
389
-
390
-
391
-
392
- # if os.path.exists(OUTPUT_DIR):
393
- # shutil.rmtree(OUTPUT_DIR)
394
- # os.makedirs(OUTPUT_DIR, exist_ok=True)
395
 
396
 
397
  # 1. Validation and Model Loading
398
  t0 = time.time()
399
  if not os.path.exists(pdf_path):
400
  report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
 
401
  return 0, 0, 0, report, time.time() - start_time, {}, []
402
 
403
  try:
@@ -442,10 +970,26 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
442
 
443
  # Core Detection
444
  detect_start = time.time()
445
- # page_equations, _ = run_yolo_detection_and_count(original_img, model, page_num)
446
- page_equations, _, page_images = run_yolo_detection_and_count(original_img, model, page_num)
447
- all_saved_images.extend(page_images)
 
 
 
 
 
 
 
 
 
 
 
448
 
 
 
 
 
 
449
  detect_time = time.time() - detect_start
450
 
451
  # Store the count in the dictionary (INT keys)
@@ -459,7 +1003,7 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
459
  detection_loop_time = t5 - t4
460
  log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
461
 
462
- # FIX APPLIED HERE: Convert integer keys to string keys for JSON serialization
463
  equation_counts_per_page_str_keys: Dict[str, int] = {
464
  str(k): v for k, v in equation_counts_per_page.items()
465
  }
@@ -470,8 +1014,8 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
470
  report = (
471
  f"✅ **YOLO Counting Complete!**\n\n"
472
  f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
473
- f"**2) Total Equations Detected:** **{GLOBAL_EQUATION_COUNT}**\n"
474
- f"**3) Total Figures Detected:** **{GLOBAL_FIGURE_COUNT}**\n"
475
  f"---\n"
476
  f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
477
  f"### Detailed Step Timing\n"
@@ -480,45 +1024,46 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
480
  f"\n```"
481
  )
482
 
483
- # Return the dictionary with string keys
484
- # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, []
485
- return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, all_saved_images
486
-
487
 
488
 
489
  # ============================================================================
490
  # --- GRADIO INTERFACE FUNCTION (Updated) ---
491
  # ============================================================================
492
 
493
- def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[str]]:
 
494
  """
495
  Gradio wrapper function to handle file upload and return results.
496
  """
497
  if pdf_file is None:
498
- # Return an empty dict with string keys
499
  return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
500
 
501
  pdf_path = pdf_file.name
502
 
503
  try:
504
  # Unpack the new return value: equation_counts_per_page (with string keys)
505
- # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, _ = run_single_pdf_preprocessing(
506
- # pdf_path
507
- # )
508
- # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
509
- num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
510
-
 
 
 
511
 
512
 
513
  # Return results (6 items now)
514
- # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, []
515
- return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, images
516
 
517
 
518
  except Exception as e:
519
  error_msg = f"An unexpected error occurred: {e}"
520
  logging.error(error_msg, exc_info=True)
521
- # Return an empty dict on error
522
  return "Error", "Error", "Error", error_msg, {}, []
523
 
524
 
@@ -542,9 +1087,9 @@ if __name__ == "__main__":
542
  # NEW OUTPUT: JSON component for structured data
543
  output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
544
 
545
- # Gradio Gallery is retained but will receive an empty list []
546
  output_gallery = gr.Gallery(
547
- label="Detected Equations (Disabled for Speed)",
548
  columns=5,
549
  height="auto",
550
  object_fit="contain",
@@ -554,7 +1099,7 @@ if __name__ == "__main__":
554
  interface = gr.Interface(
555
  fn=gradio_process_pdf,
556
  inputs=input_file,
557
- # Outputs list remains the same, but the JSON component now receives string keys.
558
  outputs=[
559
  output_pages,
560
  output_equations,
@@ -563,18 +1108,11 @@ if __name__ == "__main__":
563
  output_page_counts,
564
  output_gallery
565
  ],
566
- title="📊 YOLO Counting with Per-Page Data & Timing",
567
  description=(
568
- "Upload a PDF to run YOLO detection. The results include total counts, a breakdown of "
569
- "equation counts per page (in JSON format), and detailed timing."
570
  ),
571
  )
572
 
573
  print("\nStarting Gradio application...")
574
- # interface.launch(inbrowser=True)
575
- interface.launch(
576
- inbrowser=True,
577
- # allowed_paths=[OUTPUT_DIR]
578
- )
579
-
580
-
 
1
 
2
+ # import base64
3
+ # from PIL import Image
4
+ # import re
5
+
6
+
7
+
8
+
9
+
10
+
11
+ # import fitz # PyMuPDF
12
+ # import numpy as np
13
+ # import cv2
14
+ # import torch
15
+ # import torch.serialization
16
+ # import os
17
+ # import time
18
+ # from typing import Optional, Tuple, List, Dict, Any
19
+ # from ultralytics import YOLO
20
+ # import logging
21
+ # import gradio as gr
22
+ # import shutil
23
+ # import tempfile
24
+ # import io
25
+
26
+ # # ============================================================================
27
+ # # --- Global Patches and Setup ---
28
+ # # ============================================================================
29
+
30
+ # # Patch torch.load to prevent weights_only error with older models
31
+ # _original_torch_load = torch.load
32
+ # def patched_torch_load(*args, **kwargs):
33
+ # kwargs["weights_only"] = False
34
+ # return _original_torch_load(*args, **kwargs)
35
+ # torch.load = patched_torch_load
36
+
37
+ # logging.basicConfig(level=logging.WARNING)
38
+
39
+ # # ============================================================================
40
+ # # --- CONFIGURATION AND CONSTANTS ---
41
+ # # ============================================================================
42
+
43
+ # WEIGHTS_PATH = 'best.pt'
44
+ # SCALE_FACTOR = 2.0
45
+ # # OUTPUT_DIR = "yolo_extracted_regions"
46
+ # # OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
47
+
48
+ # from transformers import TrOCRProcessor
49
+ # from optimum.onnxruntime import ORTModelForVision2Seq
50
+
51
+
52
+
53
+ # MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
54
+ # processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
55
+ # ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+ # # Detection parameters
68
+ # CONF_THRESHOLD = 0.2
69
+ # TARGET_CLASSES = ['figure', 'equation']
70
+ # IOU_MERGE_THRESHOLD = 0.4
71
+ # IOA_SUPPRESSION_THRESHOLD = 0.7
72
+
73
+ # # Global counters (Reset per run)
74
+ # GLOBAL_FIGURE_COUNT = 0
75
+ # GLOBAL_EQUATION_COUNT = 0
76
+
77
+ # # ============================================================================
78
+ # # --- BOX COMBINATION LOGIC (Retained for detection accuracy) ---
79
+ # # ============================================================================
80
+
81
+ # def calculate_iou(box1, box2):
82
+ # x1_a, y1_a, x2_a, y2_a = box1
83
+ # x1_b, y1_b, x2_b, y2_b = box2
84
+ # x_left = max(x1_a, x1_b)
85
+ # y_top = max(y1_a, y1_b)
86
+ # x_right = min(x2_a, x2_b)
87
+ # y_bottom = min(y2_a, y2_b)
88
+ # intersection_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)
89
+ # box_a_area = (x2_a - x1_a) * (y2_a - y1_a)
90
+ # box_b_area = (x2_b - x1_b) * (y2_b - y1_b)
91
+ # union_area = float(box_a_area + box_b_area - intersection_area)
92
+ # return intersection_area / union_area if union_area > 0 else 0
93
+
94
+
95
+ # def filter_nested_boxes(detections, ioa_threshold=0.80):
96
+ # if not detections: return []
97
+ # for d in detections:
98
+ # x1, y1, x2, y2 = d['coords']
99
+ # d['area'] = (x2 - x1) * (y2 - y1)
100
+ # detections.sort(key=lambda x: x['area'], reverse=True)
101
+ # keep_indices = []
102
+ # is_suppressed = [False] * len(detections)
103
+ # for i in range(len(detections)):
104
+ # if is_suppressed[i]: continue
105
+ # keep_indices.append(i)
106
+ # box_a = detections[i]['coords']
107
+ # for j in range(i + 1, len(detections)):
108
+ # if is_suppressed[j]: continue
109
+ # box_b = detections[j]['coords']
110
+ # x_left = max(box_a[0], box_b[0])
111
+ # y_top = max(box_a[1], box_b[1])
112
+ # x_right = min(box_a[2], box_b[2])
113
+ # y_bottom = min(box_a[3], box_b[3])
114
+ # intersection = max(0, x_right - x_left) * max(0, y_bottom - y_top)
115
+ # area_b = detections[j]['area']
116
+ # if area_b > 0 and intersection / area_b > ioa_threshold:
117
+ # is_suppressed[j] = True
118
+ # return [detections[i] for i in keep_indices]
119
+
120
+
121
+ # def merge_overlapping_boxes(detections, iou_threshold):
122
+ # if not detections: return []
123
+ # detections.sort(key=lambda d: d['conf'], reverse=True)
124
+ # merged_detections = []
125
+ # is_merged = [False] * len(detections)
126
+ # for i in range(len(detections)):
127
+ # if is_merged[i]: continue
128
+ # current_box = detections[i]['coords']
129
+ # current_class = detections[i]['class']
130
+ # merged_x1, merged_y1, merged_x2, merged_y2 = current_box
131
+ # for j in range(i + 1, len(detections)):
132
+ # if is_merged[j] or detections[j]['class'] != current_class: continue
133
+ # other_box = detections[j]['coords']
134
+ # iou = calculate_iou(current_box, other_box)
135
+ # if iou > iou_threshold:
136
+ # merged_x1 = min(merged_x1, other_box[0])
137
+ # merged_y1 = min(merged_y1, other_box[1])
138
+ # merged_x2 = max(merged_x2, other_box[2])
139
+ # merged_y2 = max(merged_y2, other_box[3])
140
+ # is_merged[j] = True
141
+ # merged_detections.append({
142
+ # 'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
143
+ # 'y1': merged_y1, 'class': current_class, 'conf': detections[i]['conf']
144
+ # })
145
+ # return merged_detections
146
+
147
+ # # ============================================================================
148
+ # # --- UTILITY FUNCTIONS ---
149
+ # # ============================================================================
150
+
151
+ # def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
152
+ # """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
153
+ # img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
154
+ # (pix.h, pix.w, pix.n)
155
+ # )
156
+ # if pix.n == 4:
157
+ # img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
158
+ # elif pix.n == 1:
159
+ # img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
160
+ # return img
161
+
162
+
163
+
164
+
165
+ # def run_yolo_detection_and_count(
166
+ # image: np.ndarray, model: YOLO, page_num: int
167
+ # ) -> Tuple[int, int, List[Dict[str, str]]]:
168
+
169
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
170
+
171
+ # yolo_detections = []
172
+ # page_equations = 0
173
+ # page_figures = 0
174
+ # detected_items = []
175
+
176
+ # try:
177
+ # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
178
+
179
+ # if results and results[0].boxes:
180
+ # for box in results[0].boxes.data.tolist():
181
+ # x1, y1, x2, y2, conf, cls_id = box
182
+ # cls_name = model.names[int(cls_id)]
183
+
184
+ # if cls_name in TARGET_CLASSES:
185
+ # yolo_detections.append({
186
+ # 'coords': (x1, y1, x2, y2),
187
+ # 'class': cls_name,
188
+ # 'conf': conf
189
+ # })
190
+ # except Exception as e:
191
+ # logging.error(f"YOLO inference failed on page {page_num}: {e}")
192
+ # return 0, 0, []
193
+
194
+ # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
195
+ # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
196
+
197
+ # for det in final_detections:
198
+ # bbox = det["coords"]
199
+
200
+ # if det["class"] == "equation":
201
+ # GLOBAL_EQUATION_COUNT += 1
202
+ # page_equations += 1
203
+
204
+ # b64 = crop_and_convert_to_base64(image, bbox)
205
+ # detected_items.append({
206
+ # "type": "equation",
207
+ # "id": f"EQUATION{GLOBAL_EQUATION_COUNT}",
208
+ # "base64": b64
209
+ # })
210
+
211
+ # elif det["class"] == "figure":
212
+ # GLOBAL_FIGURE_COUNT += 1
213
+ # page_figures += 1
214
+
215
+ # b64 = crop_and_convert_to_base64(image, bbox)
216
+ # detected_items.append({
217
+ # "type": "figure",
218
+ # "id": f"FIGURE{GLOBAL_FIGURE_COUNT}",
219
+ # "base64": b64
220
+ # })
221
+
222
+ # logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
223
+ # return page_equations, page_figures, detected_items
224
+
225
+
226
+
227
+ # def get_latex_from_base64(base64_string: str) -> str:
228
+ # if ort_model is None or processor is None:
229
+ # return "[MODEL_ERROR: Model not initialized]"
230
+
231
+ # try:
232
+ # image_data = base64.b64decode(base64_string)
233
+ # image = Image.open(io.BytesIO(image_data)).convert('RGB')
234
+
235
+ # pixel_values = processor(images=image, return_tensors="pt").pixel_values
236
+ # generated_ids = ort_model.generate(pixel_values)
237
+ # raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
238
+
239
+ # if not raw_text:
240
+ # return "[OCR_WARNING: No formula found]"
241
+
242
+ # latex = raw_text[0]
243
+ # latex = re.sub(r'[\r\n]+', '', latex)
244
+
245
+ # return latex
246
+
247
+ # except Exception as e:
248
+ # return f"[TR_OCR_ERROR: {e}]"
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+ # def extract_images_from_page_in_memory(page) -> Dict[str, str]:
266
+ # """
267
+ # Extract images from a page and return:
268
+ # { "EQUATION1": base64_string, "FIGURE1": base64_string }
269
+ # """
270
+ # image_map = {}
271
+ # image_list = page.get_images(full=True)
272
+
273
+ # for idx, img in enumerate(image_list, start=1):
274
+ # xref = img[0]
275
+ # base = page.parent.extract_image(xref)
276
+ # image_bytes = base["image"]
277
+
278
+ # base64_img = base64.b64encode(image_bytes).decode("utf-8")
279
+
280
+ # # Convention: every extracted image is keyed FIGURE<idx> in page order (no EQUATION keys are produced here)
281
+ # # You can tune this if needed
282
+ # image_map[f"FIGURE{idx}"] = base64_img
283
+
284
+ # return image_map
285
+
286
+
287
+
288
+
289
+ # def embed_images_as_base64_in_memory(structured_data, detected_items):
290
+ # tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
291
 
292
+ # item_lookup = {d["id"]: d for d in detected_items}
293
+ # final_data = []
294
 
295
+ # for item in structured_data:
296
+ # text_fields = [
297
+ # item.get('question', ''),
298
+ # item.get('passage', ''),
299
+ # item.get('new_passage', '')
300
+ # ]
301
 
302
+ # if 'options' in item:
303
+ # text_fields.extend(item['options'].values())
304
 
305
+ # used_tags = set()
306
+
307
+ # for text in text_fields:
308
+ # for m in tag_regex.finditer(text or ""):
309
+ # used_tags.add(m.group(0).upper())
310
+
311
+ # for tag in used_tags:
312
+ # base_key = tag.lower().replace(" ", "")
313
+
314
+ # if tag not in item_lookup:
315
+ # item[base_key] = "[MISSING_IMAGE]"
316
+ # continue
317
+
318
+ # entry = item_lookup[tag]
319
+
320
+ # if entry["type"] == "equation":
321
+ # item[base_key] = get_latex_from_base64(entry["base64"])
322
+
323
+ # else:
324
+ # item[base_key] = entry["base64"]
325
+
326
+ # final_data.append(item)
327
+
328
+ # return final_data
329
+
330
+
331
+
332
+
333
+
334
+
335
+ # def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
336
+ # x1, y1, x2, y2 = map(int, bbox)
337
+ # h, w, _ = image.shape
338
+
339
+ # x1 = max(0, x1)
340
+ # y1 = max(0, y1)
341
+ # x2 = min(w, x2)
342
+ # y2 = min(h, y2)
343
+
344
+ # crop = image[y1:y2, x1:x2]
345
+ # _, buffer = cv2.imencode(".png", crop)
346
+
347
+ # return base64.b64encode(buffer).decode("utf-8")
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+ # # ============================================================================
367
+ # # --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for JSON serialization) ---
368
+ # # ============================================================================
369
+
370
+ # # NOTE: The return signature now uses Dict[str, int] for the equation counts
371
+ # def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, float, Dict[str, int], List[str]]:
372
+ # """
373
+ # Runs the pipeline, returns counts, report, total time, page counts dict (str keys), and the list of detected items.
374
+ # """
375
+
376
+
377
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
378
+ # start_time = time.time()
379
+ # log_messages = []
380
+ # all_saved_images = []
381
+ # all_base64_images: List[str] = []
382
+
383
+ # # Dictionary to store {page_number (int): equation_count (int)}
384
+ # equation_counts_per_page: Dict[int, int] = {}
385
+
386
+ # # Reset globals
387
+ # GLOBAL_FIGURE_COUNT = 0
388
+ # GLOBAL_EQUATION_COUNT = 0
389
+
390
+
391
+
392
+ # # if os.path.exists(OUTPUT_DIR):
393
+ # # shutil.rmtree(OUTPUT_DIR)
394
+ # # os.makedirs(OUTPUT_DIR, exist_ok=True)
395
+
396
+
397
+ # # 1. Validation and Model Loading
398
+ # t0 = time.time()
399
+ # if not os.path.exists(pdf_path):
400
+ # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
401
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
402
+
403
+ # try:
404
+ # model = YOLO(WEIGHTS_PATH)
405
+ # logging.warning(f"✅ Loaded YOLO model from: {WEIGHTS_PATH}")
406
+ # except Exception as e:
407
+ # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
408
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
409
+ # t1 = time.time()
410
+ # log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
411
+
412
+ # # 2. PDF Loading
413
+ # t2 = time.time()
414
+ # try:
415
+ # doc = fitz.open(pdf_path)
416
+ # total_pages = doc.page_count
417
+ # logging.warning(f"✅ Opened PDF with {doc.page_count} pages")
418
+ # except Exception as e:
419
+ # report = f"❌ ERROR loading PDF file: {e}"
420
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
421
+ # t3 = time.time()
422
+ # log_messages.append(f"PDF Initialization Time: {t3-t2:.4f}s")
423
+
424
+ # mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
425
+
426
+ # # 3. Page Processing and Detection Loop
427
+ # t4 = time.time()
428
+ # for page_num_0_based in range(doc.page_count):
429
+ # page_start_time = time.time()
430
+ # fitz_page = doc.load_page(page_num_0_based)
431
+ # page_num = page_num_0_based + 1
432
+
433
+ # # Render page to image for YOLO
434
+ # try:
435
+ # pix_start = time.time()
436
+ # pix = fitz_page.get_pixmap(matrix=mat)
437
+ # original_img = pixmap_to_numpy(pix)
438
+ # pix_time = time.time() - pix_start
439
+ # except Exception as e:
440
+ # logging.error(f"Error converting page {page_num} to image: {e}. Skipping.")
441
+ # continue
442
+
443
+ # # Core Detection
444
+ # detect_start = time.time()
445
+ # # page_equations, _ = run_yolo_detection_and_count(original_img, model, page_num)
446
+ # page_equations, _, page_images = run_yolo_detection_and_count(original_img, model, page_num)
447
+ # all_saved_images.extend(page_images)
448
+
449
+ # detect_time = time.time() - detect_start
450
+
451
+ # # Store the count in the dictionary (INT keys)
452
+ # equation_counts_per_page[page_num] = page_equations
453
+
454
+ # page_total_time = time.time() - page_start_time
455
+ # log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s)")
456
+
457
+ # doc.close()
458
+ # t5 = time.time()
459
+ # detection_loop_time = t5 - t4
460
+ # log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
461
+
462
+ # # FIX APPLIED HERE: Convert integer keys to string keys for JSON serialization
463
+ # equation_counts_per_page_str_keys: Dict[str, int] = {
464
+ # str(k): v for k, v in equation_counts_per_page.items()
465
+ # }
466
+
467
+ # # 4. Final Report Generation
468
+ # total_execution_time = t5 - start_time
469
+
470
+ # report = (
471
+ # f"✅ **YOLO Counting Complete!**\n\n"
472
+ # f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
473
+ # f"**2) Total Equations Detected:** **{GLOBAL_EQUATION_COUNT}**\n"
474
+ # f"**3) Total Figures Detected:** **{GLOBAL_FIGURE_COUNT}**\n"
475
+ # f"---\n"
476
+ # f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
477
+ # f"### Detailed Step Timing\n"
478
+ # f"```\n"
479
+ # + "\n".join(log_messages) +
480
+ # f"\n```"
481
+ # )
482
+
483
+ # # Return the dictionary with string keys
484
+ # # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, []
485
+ # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, all_saved_images
486
+
487
+
488
+
489
+ # # ============================================================================
490
+ # # --- GRADIO INTERFACE FUNCTION (Updated) ---
491
+ # # ============================================================================
492
+
493
+ # def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[str]]:
494
+ # """
495
+ # Gradio wrapper function to handle file upload and return results.
496
+ # """
497
+ # if pdf_file is None:
498
+ # # Return an empty dict with string keys
499
+ # return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
500
+
501
+ # pdf_path = pdf_file.name
502
+
503
+ # try:
504
+ # # Unpack the new return value: equation_counts_per_page (with string keys)
505
+ # # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, _ = run_single_pdf_preprocessing(
506
+ # # pdf_path
507
+ # # )
508
+ # # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
509
+ # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
510
+
511
+
512
+
513
+ # # Return results (6 items now)
514
+ # # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, []
515
+ # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, images
516
+
517
+
518
+ # except Exception as e:
519
+ # error_msg = f"An unexpected error occurred: {e}"
520
+ # logging.error(error_msg, exc_info=True)
521
+ # # Return an empty dict on error
522
+ # return "Error", "Error", "Error", error_msg, {}, []
523
 
524
 
525
+ # # ============================================================================
526
+ # # --- GRADIO INTERFACE DEFINITION (Updated) ---
527
+ # # ============================================================================
528
+
529
+ # if __name__ == "__main__":
530
+
531
+ # if not os.path.exists(WEIGHTS_PATH):
532
+ # logging.error(f"❌ FATAL ERROR: YOLO weight file '{WEIGHTS_PATH}' not found. Cannot run live inference.")
533
+
534
+ # input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
535
+
536
+ # # Outputs
537
+ # output_pages = gr.Textbox(label="Total Pages in PDF", interactive=False)
538
+ # output_equations = gr.Textbox(label="Total Equations Detected", interactive=False)
539
+ # output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
540
+ # output_report = gr.Markdown(label="Processing Summary and Timing")
541
+
542
+ # # NEW OUTPUT: JSON component for structured data
543
+ # output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
544
+
545
+ # # Gradio Gallery is retained but will receive an empty list []
546
+ # output_gallery = gr.Gallery(
547
+ # label="Detected Equations (Disabled for Speed)",
548
+ # columns=5,
549
+ # height="auto",
550
+ # object_fit="contain",
551
+ # allow_preview=False
552
+ # )
553
+
554
+ # interface = gr.Interface(
555
+ # fn=gradio_process_pdf,
556
+ # inputs=input_file,
557
+ # # Outputs list remains the same, but the JSON component now receives string keys.
558
+ # outputs=[
559
+ # output_pages,
560
+ # output_equations,
561
+ # output_figures,
562
+ # output_report,
563
+ # output_page_counts,
564
+ # output_gallery
565
+ # ],
566
+ # title="📊 YOLO Counting with Per-Page Data & Timing",
567
+ # description=(
568
+ # "Upload a PDF to run YOLO detection. The results include total counts, a breakdown of "
569
+ # "equation counts per page (in JSON format), and detailed timing."
570
+ # ),
571
+ # )
572
+
573
+ # print("\nStarting Gradio application...")
574
+ # # interface.launch(inbrowser=True)
575
+ # interface.launch(
576
+ # inbrowser=True,
577
+ # # allowed_paths=[OUTPUT_DIR]
578
+ # )
579
+
580
+
581
+
582
+
583
+
584
+
585
+
586
+ import base64
587
+ from PIL import Image
588
+ import re
589
  import fitz # PyMuPDF
590
  import numpy as np
591
  import cv2
592
  import torch
593
  import torch.serialization
594
  import os
595
+ import time
596
+ from typing import Optional, Tuple, List, Dict, Any, Union
597
  from ultralytics import YOLO
598
  import logging
599
  import gradio as gr
 
618
  # --- CONFIGURATION AND CONSTANTS ---
619
  # ============================================================================
620
 
621
+ WEIGHTS_PATH = 'best.pt'
622
+ SCALE_FACTOR = 2.0
 
 
623
 
624
+ # --- OCR Model Initialization (Retained but not used in the main loop for counting) ---
625
  from transformers import TrOCRProcessor
626
  from optimum.onnxruntime import ORTModelForVision2Seq
627
 
 
 
628
  MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
629
+ # Note: These models are kept global but unused in the main flow,
630
+ # as the user did not explicitly ask to remove the heavy OCR dependency yet.
631
+ try:
632
+ processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
633
+ ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
634
+ except Exception as e:
635
+ logging.warning(f"OCR model loading failed (expected if dependencies are missing): {e}")
636
+ processor = None
637
+ ort_model = None
 
 
 
638
 
639
  # Detection parameters
640
  CONF_THRESHOLD = 0.2
 
642
  IOU_MERGE_THRESHOLD = 0.4
643
  IOA_SUPPRESSION_THRESHOLD = 0.7
644
 
645
+ # --- REMOVED GLOBAL COUNTERS ---
646
+ # GLOBAL_FIGURE_COUNT = 0
647
+ # GLOBAL_EQUATION_COUNT = 0
648
+
649
 
650
  # ============================================================================
651
+ # --- BOX COMBINATION LOGIC (Retained) ---
652
  # ============================================================================
653
 
654
  def calculate_iou(box1, box2):
 
709
  merged_x1 = min(merged_x1, other_box[0])
710
  merged_y1 = min(merged_y1, other_box[1])
711
  merged_x2 = max(merged_x2, other_box[2])
712
+ merged_y2 = max(merged_y2, other_box[3])
713
  is_merged[j] = True
714
  merged_detections.append({
715
  'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
 
733
  return img
734
 
735
 
736
def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
    """Crop ``bbox`` out of ``image`` and return the crop as a base64 PNG string.

    Args:
        image: Image array whose first two axes are (height, width). A
            trailing channel axis is optional.
        bbox: ``(x1, y1, x2, y2)`` corner coordinates in pixels. Each value
            is truncated to ``int`` and clamped to the image bounds.

    Returns:
        The PNG-encoded crop, base64-encoded and decoded to a UTF-8 ``str``
        (raw base64, without any ``data:`` URI prefix).

    Raises:
        ValueError: If OpenCV reports that PNG encoding did not succeed.
    """
    x1, y1, x2, y2 = map(int, bbox)
    # Only height and width are needed, so 2-D (single-channel) arrays are
    # accepted as well as H x W x C ones.
    h, w = image.shape[:2]

    # Clamp every coordinate to [0, dim]. Clamping only one side would let a
    # negative x2/y2 reach the slice below, where it is interpreted as a
    # from-the-end index and silently selects the wrong region.
    x1 = min(max(0, x1), w)
    y1 = min(max(0, y1), h)
    x2 = min(max(0, x2), w)
    y2 = min(max(0, y2), h)

    crop = image[y1:y2, x1:x2]
    ok, buffer = cv2.imencode(".png", crop)
    if not ok:
        raise ValueError(f"PNG encoding failed for bbox {(x1, y1, x2, y2)}")

    return base64.b64encode(buffer).decode("utf-8")
749
 
 
750
 
751
+ # --- NEW: Function to format base64 for Gradio Gallery ---
752
def base64_to_gradio_gallery_tuple(base64_str: str, label: str) -> Tuple[str, str]:
    """Build the ``(image, caption)`` pair for one gallery entry.

    Args:
        base64_str: Raw base64 text of a PNG image (no URI prefix).
        label: Caption to show alongside the image.

    Returns:
        A 2-tuple of the PNG data URI wrapping ``base64_str`` and ``label``.
    """
    data_uri = "data:image/png;base64,{}".format(base64_str)
    return data_uri, label
756
+
757
+
758
+ # --- UPDATED: run_yolo_detection_and_count to use passed counters ---
759
+ def run_yolo_detection_and_count(
760
+ image: np.ndarray, model: YOLO, page_num: int,
761
+ current_eq_count: int, current_fig_count: int
762
+ ) -> Tuple[int, int, List[Dict[str, str]], int, int]:
763
+ """
764
+ Performs YOLO detection and returns page counts, detected items,
765
+ and the updated global counters.
766
+ """
767
+
768
+ # Use the passed counters as starting points for this page
769
+ eq_counter = current_eq_count
770
+ fig_counter = current_fig_count
771
+
772
  page_equations = 0
773
  page_figures = 0
774
  detected_items = []
775
+ yolo_detections = []
776
 
777
  try:
778
  results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
 
790
  })
791
  except Exception as e:
792
  logging.error(f"YOLO inference failed on page {page_num}: {e}")
793
+ return 0, 0, [], eq_counter, fig_counter
794
 
795
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
796
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
 
799
  bbox = det["coords"]
800
 
801
  if det["class"] == "equation":
802
+ eq_counter += 1
803
  page_equations += 1
804
 
805
  b64 = crop_and_convert_to_base64(image, bbox)
806
  detected_items.append({
807
  "type": "equation",
808
+ "id": f"EQUATION{eq_counter}",
809
  "base64": b64
810
  })
811
 
812
  elif det["class"] == "figure":
813
+ fig_counter += 1
814
  page_figures += 1
815
 
816
  b64 = crop_and_convert_to_base64(image, bbox)
817
  detected_items.append({
818
  "type": "figure",
819
+ "id": f"FIGURE{fig_counter}",
820
  "base64": b64
821
  })
822
 
823
  logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
824
+ # Return page counts, detected items, and the UPDATED total counters
825
+ return page_equations, page_figures, detected_items, eq_counter, fig_counter
826
 
827
 
828
+ # --- Other unused functions (get_latex_from_base64, etc.) are kept but not modified as
829
+ # the focus is on the concurrency and Gradio Gallery fix. ---
830
 
831
  def get_latex_from_base64(base64_string: str) -> str:
832
  if ort_model is None or processor is None:
 
852
  return f"[TR_OCR_ERROR: {e}]"
853
 
854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  def embed_images_as_base64_in_memory(structured_data, detected_items):
856
  tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
857
 
 
892
  final_data.append(item)
893
 
894
  return final_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
 
896
  # ============================================================================
897
+ # --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for concurrency) ---
898
  # ============================================================================
899
 
900
+ # --- UPDATED return type for clarity ---
901
+ def run_single_pdf_preprocessing(
902
+ pdf_path: str
903
+ ) -> Tuple[int, int, int, str, float, Dict[str, int], List[Tuple[str, str]]]:
904
  """
905
+ Runs the pipeline, returns counts, report, total time, page counts dict (str keys),
906
+ and a list of (image_data_uri, label) for the Gradio gallery.
907
  """
908
 
909
+ # --- INITIALIZE LOCAL COUNTERS ---
 
910
  start_time = time.time()
911
  log_messages = []
912
+
913
+ # This list now holds (data_uri, label) tuples for Gradio
914
+ all_gradio_gallery_items: List[Tuple[str, str]] = []
915
 
916
  # Dictionary to store {page_number (int): equation_count (int)}
917
  equation_counts_per_page: Dict[int, int] = {}
918
 
919
+ # --- USE LOCAL COUNTERS FOR THREAD SAFETY ---
920
+ total_figure_count = 0
921
+ total_equation_count = 0
 
 
 
 
 
 
922
 
923
 
924
  # 1. Validation and Model Loading
925
  t0 = time.time()
926
  if not os.path.exists(pdf_path):
927
  report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
928
+ # Return empty list of tuples for gallery on error
929
  return 0, 0, 0, report, time.time() - start_time, {}, []
930
 
931
  try:
 
970
 
971
  # Core Detection
972
  detect_start = time.time()
973
+ # --- PASSING AND RECEIVING THE COUNTERS HERE (Concurrency Fix) ---
974
+ (
975
+ page_equations,
976
+ page_figures,
977
+ page_images_dicts,
978
+ total_equation_count,
979
+ total_figure_count
980
+ ) = run_yolo_detection_and_count(
981
+ original_img,
982
+ model,
983
+ page_num,
984
+ total_equation_count,
985
+ total_figure_count
986
+ )
987
 
988
+ # --- FORMATTING FOR GRADIO GALLERY (Gradio Format Fix) ---
989
+ for item in page_images_dicts:
990
+ gradio_tuple = base64_to_gradio_gallery_tuple(item["base64"], item["id"])
991
+ all_gradio_gallery_items.append(gradio_tuple)
992
+
993
  detect_time = time.time() - detect_start
994
 
995
  # Store the count in the dictionary (INT keys)
 
1003
  detection_loop_time = t5 - t4
1004
  log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
1005
 
1006
+ # Convert integer keys to string keys for JSON serialization
1007
  equation_counts_per_page_str_keys: Dict[str, int] = {
1008
  str(k): v for k, v in equation_counts_per_page.items()
1009
  }
 
1014
  report = (
1015
  f"✅ **YOLO Counting Complete!**\n\n"
1016
  f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
1017
+ f"**2) Total Equations Detected:** **{total_equation_count}**\n" # Uses local final count
1018
+ f"**3) Total Figures Detected:** **{total_figure_count}**\n" # Uses local final count
1019
  f"---\n"
1020
  f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
1021
  f"### Detailed Step Timing\n"
 
1024
  f"\n```"
1025
  )
1026
 
1027
+ # Return the dictionary with string keys and the properly formatted gallery items
1028
+ return total_pages, total_equation_count, total_figure_count, report, total_execution_time, equation_counts_per_page_str_keys, all_gradio_gallery_items
 
 
1029
 
1030
 
1031
  # ============================================================================
1032
  # --- GRADIO INTERFACE FUNCTION (Updated) ---
1033
  # ============================================================================
1034
 
1035
+ # --- UPDATED return type for clarity ---
1036
def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[Tuple[str, str]]]:
    """Gradio callback: run the detection pipeline on an uploaded PDF.

    Args:
        pdf_file: The value Gradio supplies for the file input. ``None`` when
            nothing was uploaded; otherwise either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            its ``.name`` attribute.

    Returns:
        A 6-tuple in the order of the interface outputs: page count,
        equation count and figure count (each as a string), the Markdown
        report, the per-page equation counts, and the gallery items. When no
        file is given the three counts are ``"N/A"``; when processing fails
        they are ``"Error"`` and the report slot carries the error message.
        In both cases the dict and the gallery list are empty.
    """
    if pdf_file is None:
        # Nothing uploaded: empty dict / empty gallery keep the output shape.
        return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []

    try:
        # Resolve the path inside the try so that an unexpected upload object
        # is reported through the UI rather than raising out of the callback.
        # A plain path string has no ``.name`` attribute, so handle it first.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        (
            num_pages,
            num_equations,
            num_figures,
            report,
            _total_time,  # already embedded in the report text
            equation_counts_per_page,
            gallery_items,
        ) = run_single_pdf_preprocessing(pdf_path)

        return (
            str(num_pages),
            str(num_equations),
            str(num_figures),
            report,
            equation_counts_per_page,
            gallery_items,
        )

    except Exception as e:
        # Top-level UI boundary: log with traceback and surface the message.
        error_msg = f"An unexpected error occurred: {e}"
        logging.error(error_msg, exc_info=True)
        return "Error", "Error", "Error", error_msg, {}, []
1068
 
1069
 
 
1087
  # NEW OUTPUT: JSON component for structured data
1088
  output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
1089
 
1090
+ # Gradio Gallery is retained and now receives the correctly formatted list of tuples
1091
  output_gallery = gr.Gallery(
1092
+ label="Detected Items (Gallery Format Fix Applied)",
1093
  columns=5,
1094
  height="auto",
1095
  object_fit="contain",
 
1099
  interface = gr.Interface(
1100
  fn=gradio_process_pdf,
1101
  inputs=input_file,
1102
+ # Outputs list remains the same, but the gallery now works
1103
  outputs=[
1104
  output_pages,
1105
  output_equations,
 
1108
  output_page_counts,
1109
  output_gallery
1110
  ],
1111
+ title="📊 YOLO Counting with Per-Page Data & Timing (Concurrency Fix)",
1112
  description=(
1113
+ "Upload a PDF to run YOLO detection. The concurrency bug and Gradio Gallery display error have been fixed."
 
1114
  ),
1115
  )
1116
 
1117
  print("\nStarting Gradio application...")
1118
+ interface.launch(inbrowser=True)