AkshitShubham committed on
Commit
9231f06
·
verified ·
1 Parent(s): 1817521

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +193 -625
app.py CHANGED
@@ -1,661 +1,229 @@
1
  import os
2
  import requests
3
- import json
4
- import gradio as gr
5
  from PIL import Image, ImageDraw
6
  import io
7
  import base64
8
- import re
9
- import fitz
10
- import zipfile
11
  import tempfile
12
- import time
13
-
14
- # --- Configuration ---
15
- NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
16
- if not NVIDIA_API_KEY:
17
- raise ValueError("NVIDIA_API_KEY environment variable not set.")
18
-
19
- NIM_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
20
- HEADERS = {
21
- "Authorization": f"Bearer {NVIDIA_API_KEY}",
22
- "Accept": "application/json",
23
- "Content-Type": "application/json",
24
- }
25
- MODEL_MAX_WIDTH = 1648
26
- MODEL_MAX_HEIGHT = 2048
27
-
28
- # Global store for processed data to enable download later (key is timestamp)
29
- PROCESSED_PAGES_STORE = {}
30
- CROPPED_QUESTIONS_STORE = {}
31
-
32
- # --- 1. Helpers for Page Selection, Image & API (Unchanged) ---
33
-
34
- # ... (resize_image_if_needed, call_parse_api_base64, get_question_number, process_column functions remain the same) ...
35
- # [Note: Due to space constraints, these helpers are assumed to be copied from the previous final working script]
36
-
37
- # --- Core Processing Logic (Re-included for clarity of dependencies) ---
38
-
39
- def resize_image_if_needed(image: Image.Image) -> Image.Image:
40
- width, height = image.size
41
- if width > MODEL_MAX_WIDTH or height > MODEL_MAX_HEIGHT:
42
- ratio = min(MODEL_MAX_WIDTH / width, MODEL_MAX_HEIGHT / height)
43
- new_width = int(width * ratio)
44
- new_height = int(height * ratio)
45
- return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
46
- return image
47
-
48
- def call_parse_api_base64(image_bytes: bytes):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  try:
50
- base64_encoded_data = base64.b64encode(image_bytes)
51
- base64_string = base64_encoded_data.decode('utf-8')
52
- image_url = f"data:image/png;base64,{base64_string}"
53
- payload = {
54
- "model": "nvidia/nemoretriever-parse",
55
- "messages": [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}]}],
56
- "tools": [{"type": "function", "function": {"name": "markdown_bbox"}}],
57
- "max_tokens": 2048,
58
- }
59
- response = requests.post(NIM_API_URL, headers=HEADERS, json=payload, timeout=300)
60
- response.raise_for_status()
61
- return response.json()
62
- except requests.exceptions.RequestException as e:
63
- error_detail = str(e)
64
- if e.response is not None:
65
- try:
66
- error_detail = e.response.json().get("detail", e.response.text)
67
- except json.JSONDecodeError:
68
- error_detail = e.response.text
69
- raise gr.Error(f"API Error: {error_detail}")
70
-
71
- def get_question_number(text: str) -> int:
72
- match = re.match(r"^\d+", text.strip())
73
- return int(match.group(0)) if match else -1
74
-
75
- def process_and_crop(original_image: Image.Image, api_response: dict, split_page: bool):
76
- # This function now returns both the gallery images and the full question data
77
  try:
78
- tool_call = api_response["choices"][0]["message"]["tool_calls"][0]
79
- arguments_str = tool_call["function"]["arguments"]
80
- all_elements = json.loads(arguments_str)[0]
81
- except (KeyError, IndexError, json.JSONDecodeError):
82
- return original_image, [], [], 0
83
- question_starts = [elem for elem in all_elements if get_question_number(elem.get("text", "")) > 0]
84
- if not question_starts:
85
- return original_image, [], [], 0
86
- image_with_boxes = original_image.copy()
87
- img_draw = ImageDraw.Draw(image_with_boxes)
88
- all_cropped_questions = []
89
- if split_page:
90
- page_midpoint = 0.5
91
- left_starts = sorted([q for q in question_starts if q['bbox']['xmin'] < page_midpoint], key=lambda q: q['bbox']['ymin'])
92
- right_starts = sorted([q for q in question_starts if q['bbox']['xmin'] >= page_midpoint], key=lambda q: q['bbox']['ymin'])
93
- process_column(left_starts, all_elements, (0.0, page_midpoint), img_draw, original_image, all_cropped_questions)
94
- process_column(right_starts, all_elements, (page_midpoint, 1.0), img_draw, original_image, all_cropped_questions)
95
- else:
96
- sorted_starts = sorted(question_starts, key=lambda q: q['bbox']['ymin'])
97
- process_column(sorted_starts, all_elements, (0.0, 1.0), img_draw, original_image, all_cropped_questions)
98
- all_cropped_questions.sort(key=lambda item: item[0])
99
- final_gallery_images = [item[1] for item in all_cropped_questions]
100
- return image_with_boxes, final_gallery_images, all_cropped_questions, len(all_cropped_questions)
101
-
102
- def process_column(column_starts, all_elements, column_bounds, img_draw, original_image, cropped_questions_list):
103
- # This function processes a column and filters out too small crops
104
- img_width, img_height = original_image.size
105
- MIN_CROP_WIDTH = 100 # Minimum width in pixels
106
- MIN_CROP_HEIGHT = 50 # Minimum height in pixels
107
-
108
- for i, start_element in enumerate(column_starts):
109
- q_num = get_question_number(start_element['text'])
110
- slice_ymin = start_element['bbox']['ymin']
111
- if i + 1 < len(column_starts):
112
- slice_ymax = column_starts[i+1]['bbox']['ymin']
113
- else:
114
- remaining_elements = [e for e in all_elements if e['bbox']['ymin'] >= slice_ymin and column_bounds[0] <= e['bbox']['xmin'] < column_bounds[1]]
115
- slice_ymax = max(e['bbox']['ymax'] for e in remaining_elements) if remaining_elements else 1.0
116
- elements_in_slice = [e for e in all_elements if (slice_ymin <= e['bbox']['ymin'] < slice_ymax and column_bounds[0] <= e['bbox']['xmin'] < column_bounds[1])]
117
- if not elements_in_slice: continue
118
- crop_xmin = min(e['bbox']['xmin'] for e in elements_in_slice)
119
- crop_xmax = max(e['bbox']['xmax'] for e in elements_in_slice)
120
- abs_box = (crop_xmin * img_width, slice_ymin * img_height, crop_xmax * img_width, slice_ymax * img_height)
121
-
122
- # Check if crop is too small
123
- crop_width = abs_box[2] - abs_box[0]
124
- crop_height = abs_box[3] - abs_box[1]
125
- if crop_width < MIN_CROP_WIDTH or crop_height < MIN_CROP_HEIGHT:
126
- print(f"Skipping too small crop for question {q_num}: {crop_width}x{crop_height}")
127
- continue
128
-
129
- img_draw.rectangle(abs_box, outline="red", width=3)
130
- cropped_img = original_image.crop(abs_box)
131
-
132
- # Generate descriptive filename from question text
133
- question_text = start_element.get('text', '').strip()
134
- # Clean text for filename (remove special characters, limit length)
135
- clean_text = re.sub(r'[^\w\s-]', '', question_text)[:50]
136
- clean_text = re.sub(r'\s+', '_', clean_text)
137
- filename = f"{q_num}-{clean_text}" if clean_text else f"{q_num}-question"
138
-
139
- cropped_questions_list.append((q_num, cropped_img, filename))
140
-
141
-
142
- def parse_page_ranges(range_str: str) -> set:
143
- """Parses a string like '1,3,5-10' into a set of page numbers (1-based)."""
144
- # ... (function remains the same)
145
- if not range_str: return set()
146
- pages = set()
147
- parts = range_str.split(',')
148
- for part in parts:
149
- part = part.strip()
150
- if not part: continue
151
- try:
152
- if '-' in part:
153
- start, end = map(int, part.split('-'))
154
- if start > end: continue
155
- pages.update(range(start, end + 1))
156
- else:
157
- pages.add(int(part))
158
- except ValueError:
159
- continue
160
- return pages
161
-
162
-
163
- # --- 4. NEW DOWNLOADER FUNCTION ---
164
-
165
- def upload_to_report_app(selected_indices_str: str, session_id: str):
166
  """
167
- Uploads selected questions to the Report App (Flask app on port 1302) and returns redirect URL.
 
168
  """
169
- print(f"🚀 REPORT APP UPLOAD - Starting upload process")
170
- print(f"📝 Selected indices string: '{selected_indices_str}'")
171
- print(f"🔑 Session ID: {session_id}")
172
-
173
- if session_id not in CROPPED_QUESTIONS_STORE:
174
- print(f"❌ Session {session_id} not found in CROPPED_QUESTIONS_STORE")
175
- print(f"📋 Available sessions: {list(CROPPED_QUESTIONS_STORE.keys())}")
176
- raise gr.Error("No processed questions found. Please run the extraction first.")
177
-
178
- cropped_questions = CROPPED_QUESTIONS_STORE[session_id]
179
- print(f"📊 Found {len(cropped_questions)} questions in session")
180
-
181
- if not cropped_questions:
182
- print("❌ No questions found in session")
183
- raise gr.Error("No questions were extracted from the processed files.")
184
-
185
- # If no selection specified, upload all questions
186
- if not selected_indices_str.strip():
187
- selected_indices = set(item[0] for item in cropped_questions)
188
- print(f"📌 No selection specified, using all questions: {selected_indices}")
189
- else:
190
- selected_indices = parse_page_ranges(selected_indices_str)
191
- print(f"📌 Parsed selection: {selected_indices}")
192
- if not selected_indices:
193
- print("❌ No valid indices parsed")
194
- raise gr.Error("Please enter valid question numbers/ranges.")
195
-
196
  try:
197
- print("🔧 Preparing files for upload...")
198
- # Prepare files for upload to Flask app
199
- files = []
200
- selected_questions = []
201
-
202
- for i, question_data in enumerate(cropped_questions):
203
- print(f"🔍 Processing question {i+1}/{len(cropped_questions)}: {question_data[0]} (type: {type(question_data)})")
204
-
205
- if len(question_data) >= 3:
206
- q_num, img, filename = question_data[0], question_data[1], question_data[2]
207
- print(f" ✅ Question {q_num}, filename: {filename}")
208
-
209
- if q_num in selected_indices:
210
- print(f" 🎯 Question {q_num} is selected for upload")
211
-
212
- # Convert PIL image to bytes
213
- img_io = io.BytesIO()
214
- print(f" 🖼️ Converting image to bytes (size: {img.size})")
215
- img.save(img_io, format='PNG')
216
- img_bytes = img_io.getvalue()
217
- print(f" 💾 Image converted to {len(img_bytes)} bytes")
218
-
219
- # Create file tuple for requests
220
- file_tuple = ('images', (f"{filename}.png", img_bytes, 'image/png'))
221
- files.append(file_tuple)
222
- selected_questions.append({'q_num': q_num, 'filename': filename})
223
- print(f" ✅ Added to upload list")
224
- else:
225
- print(f" ⏭️ Question {q_num} not in selection, skipping")
226
- else:
227
- print(f" ❌ Invalid question data format: {len(question_data)} items")
228
-
229
- print(f"📦 Prepared {len(files)} files for upload")
230
- print(f"📋 Selected questions: {[q['q_num'] for q in selected_questions]}")
231
-
232
- if not files:
233
- print("❌ No files prepared for upload")
234
- raise gr.Error("No matching questions found to upload.")
235
-
236
- # Upload to Flask app
237
- flask_url = 'http://localhost:1302/upload'
238
- print(f"🌐 Making POST request to: {flask_url}")
239
- print(f"📤 Uploading {len(files)} files...")
240
-
241
- response = requests.post(
242
- flask_url,
243
- files=files,
244
- timeout=30
245
- )
246
-
247
- print(f"📡 Response status: {response.status_code}")
248
- print(f"📡 Response headers: {dict(response.headers)}")
249
- print(f"📡 Response text: {response.text[:500]}...") # First 500 chars
250
-
251
- if response.status_code == 200:
252
- print("✅ Upload successful!")
253
- try:
254
- result = response.json()
255
- print(f"📋 Response JSON: {result}")
256
-
257
- flask_session_id = result.get('session_id')
258
- print(f"🔑 Flask session ID: {flask_session_id}")
259
-
260
- if flask_session_id:
261
- # Return the URL to redirect to question entry page
262
- redirect_url = f"http://localhost:1302/question_entry/{flask_session_id}"
263
- print(f"🎯 Generated redirect URL: {redirect_url}")
264
- return redirect_url
265
- else:
266
- print("❌ No session_id in Flask response")
267
- raise gr.Error("Failed to get session ID from Report App.")
268
- except json.JSONDecodeError as e:
269
- print(f"❌ JSON decode error: {e}")
270
- print(f"📄 Raw response: {response.text}")
271
- raise gr.Error("Invalid JSON response from Report App.")
272
  else:
273
- print(f"❌ HTTP error: {response.status_code}")
274
- print(f"📄 Error response: {response.text}")
275
- raise gr.Error(f"Upload failed: {response.status_code} - {response.text}")
276
-
277
- except requests.exceptions.ConnectionError as e:
278
- print(f"❌ Connection error: {e}")
279
- raise gr.Error("Could not connect to Report App. Make sure it's running on port 1302.")
280
- except requests.exceptions.Timeout as e:
281
- print(f"❌ Timeout error: {e}")
282
- raise gr.Error("Upload timed out. Please try again.")
283
  except Exception as e:
284
- print(f" Unexpected error: {type(e).__name__}: {e}")
285
- import traceback
286
- print(f"📋 Traceback: {traceback.format_exc()}")
287
- raise gr.Error(f"Upload error: {str(e)}")
288
-
289
-
290
- def zip_selected_questions(selected_indices_str: str, session_id: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  """
292
- Creates a ZIP file containing the individual question images selected by the user.
293
- `selected_indices_str` is a comma-separated string of question numbers.
294
- If empty, downloads all questions.
295
  """
296
-
297
- if session_id not in CROPPED_QUESTIONS_STORE:
298
- raise gr.Error("No processed questions found. Please run the extraction first.")
299
-
300
- cropped_questions = CROPPED_QUESTIONS_STORE[session_id]
301
-
302
- if not cropped_questions:
303
- raise gr.Error("No questions were extracted from the processed files.")
304
-
305
- # If no selection specified, download all questions
306
- if not selected_indices_str.strip():
307
- selected_indices = set(item[0] for item in cropped_questions)
308
- else:
309
- selected_indices = parse_page_ranges(selected_indices_str)
310
- if not selected_indices:
311
- raise gr.Error("Please enter valid question numbers/ranges to download.")
312
-
313
- # Create temporary zip file
314
- zip_path = os.path.join(tempfile.gettempdir(), f"questions_{session_id}.zip")
315
-
316
- with zipfile.ZipFile(zip_path, 'w') as zf:
317
- for question_data in cropped_questions:
318
- q_num, img, filename = question_data
319
-
320
- if q_num in selected_indices:
321
- # Save image to bytes buffer
322
- img_io = io.BytesIO()
323
- img.save(img_io, format='PNG')
324
- img_io.seek(0)
325
-
326
- # Add to zip file with descriptive name
327
- zf.writestr(f"{filename}.png", img_io.read())
328
 
329
- return zip_path
 
 
330
 
 
 
 
 
331
 
332
- def zip_selected_pages(selected_indices_str: str, session_id: str):
333
- """
334
- Creates a ZIP file containing the processed pages selected by the user.
335
- `selected_indices_str` is a comma-separated string of 1-based indices (0-based in Python).
336
- If empty, downloads all pages.
337
- """
338
-
339
- if session_id not in PROCESSED_PAGES_STORE:
340
- raise gr.Error("No processed results found. Please run the extraction first.")
341
 
342
- processed_pages = PROCESSED_PAGES_STORE[session_id]
343
-
344
- if not processed_pages:
345
- raise gr.Error("No pages were processed.")
346
-
347
- # If no selection specified, download all pages
348
- if not selected_indices_str.strip():
349
- selected_indices = set(range(1, len(processed_pages) + 1)) # 1-based indexing
350
- else:
351
- selected_indices = parse_page_ranges(selected_indices_str)
352
- if not selected_indices:
353
- raise gr.Error("Please enter valid page numbers/ranges to download.")
354
-
355
- # Create temporary zip file
356
- zip_path = os.path.join(tempfile.gettempdir(), f"processed_pages_{session_id}.zip")
357
-
358
- with zipfile.ZipFile(zip_path, 'w') as zf:
359
- for user_page_num in selected_indices:
360
- # Convert 1-based user input to 0-based list index
361
- list_index = user_page_num - 1
362
 
363
- if 0 <= list_index < len(processed_pages):
364
- img = processed_pages[list_index]
365
-
366
- # Save image to bytes buffer
367
- img_io = io.BytesIO()
368
- img.save(img_io, format='PNG')
369
- img_io.seek(0)
370
-
371
- # Add to zip file
372
- zf.writestr(f"Page_{user_page_num}_boxed.png", img_io.read())
373
- else:
374
- print(f"Warning: Page {user_page_num} is out of bounds and skipped.")
375
-
376
- return zip_path
377
-
378
-
379
- # --- 5. Main Gradio Function (Updated Inputs) ---
380
-
381
- def question_extractor_app(pdf_file, image_file, split_page_toggle, page_selection_str):
382
-
383
- # Determine the file source
384
- if pdf_file and image_file:
385
- raise gr.Error("Please upload either a PDF or an Image, not both.")
386
- elif pdf_file:
387
- input_filepath = pdf_file.name
388
- elif image_file:
389
- input_filepath = image_file.name
390
- else:
391
- raise gr.Error("Please upload a file.")
392
-
393
- if not NVIDIA_API_KEY:
394
- raise gr.Error("NVIDIA_API_KEY is not set. Please configure your environment variables.")
395
-
396
- # --- File Loading ---
397
- page_images_to_process = []
398
-
399
- if input_filepath.lower().endswith('.pdf'):
400
- selected_pages = parse_page_ranges(page_selection_str)
401
- doc = fitz.open(input_filepath)
402
- for page_num in range(len(doc)):
403
- if not selected_pages or (page_num + 1) in selected_pages:
404
- page = doc.load_page(page_num)
405
- pix = page.get_pixmap(dpi=300)
406
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
407
- page_images_to_process.append(img)
408
- doc.close()
409
- else:
410
- # Note: Page selection is ignored for single image files
411
- page_images_to_process.append(Image.open(input_filepath))
412
-
413
- if not page_images_to_process:
414
- return [], [], "", "", "No pages were selected or the file is empty.", "No questions found."
415
 
416
- # --- Processing ---
417
- all_processed_pages = []
418
- all_gallery_images = []
419
- all_question_data = [] # Store the full question data with metadata
420
- total_questions_found = 0
421
 
422
- for i, page_img in enumerate(page_images_to_process):
423
- processed_image = resize_image_if_needed(page_img)
424
- with io.BytesIO() as img_byte_arr:
425
- processed_image.save(img_byte_arr, format='PNG')
426
- image_bytes = img_byte_arr.getvalue()
427
 
428
- api_response = call_parse_api_base64(image_bytes)
429
- image_with_boxes, gallery_from_page, question_data_from_page, num_found = process_and_crop(processed_image, api_response, split_page_toggle)
 
 
 
 
 
 
430
 
431
- all_processed_pages.append(image_with_boxes)
432
- all_gallery_images.extend(gallery_from_page)
433
- all_question_data.extend(question_data_from_page)
434
- total_questions_found += num_found
435
-
436
- summary = f"Processed {len(page_images_to_process)} page(s) and found a total of {total_questions_found} questions."
437
-
438
- # Store processed data and generate unique session ID for download
439
- session_id = str(time.time()).replace('.', '')
440
- PROCESSED_PAGES_STORE[session_id] = all_processed_pages
441
- CROPPED_QUESTIONS_STORE[session_id] = all_question_data
442
 
443
- # Generate strings for download info
444
- available_pages_str = ", ".join(str(i+1) for i in range(len(all_processed_pages)))
445
- available_questions_str = ", ".join(str(item[0]) for item in all_question_data)
446
-
447
- return (all_processed_pages, all_gallery_images, summary, session_id,
448
- f"Available pages: {available_pages_str}", f"Available questions: {available_questions_str}")
449
 
450
- # --- 6. Launch the App ---
451
  if __name__ == "__main__":
452
-
453
- with gr.Blocks(title="NIM Question Extractor", theme=gr.themes.Soft()) as demo:
454
  gr.Markdown(
455
  """
456
- # 📄 NVIDIA NIM Question Extractor
457
- Extract and crop individual questions from PDF documents or images with multi-column support and download capabilities.
 
 
458
  """
459
  )
460
-
461
- # Input Section
462
- with gr.Group():
463
- gr.Markdown("## 📁 Input Files")
464
- with gr.Row():
465
- pdf_input = gr.File(
466
- label="Upload PDF File",
467
- file_types=['.pdf'],
468
- scale=1
469
- )
470
- image_input = gr.File(
471
- label="Upload Image File",
472
- file_types=['.png', '.jpg', '.jpeg'],
473
- scale=1
474
- )
475
-
476
- # Processing Options Section
477
- with gr.Group():
478
- gr.Markdown("## ⚙️ Processing Options")
479
- with gr.Row():
480
- with gr.Column(scale=2):
481
- page_select_input = gr.Textbox(
482
- label="Select Pages (PDF only)",
483
- placeholder="e.g., 1, 3, 5-10 (leave blank for all pages)",
484
- info="Enter page numbers or ranges separated by commas"
485
- )
486
- with gr.Column(scale=1):
487
- split_toggle = gr.Checkbox(
488
- label="Two-Column Layout",
489
- info="Check if document has two columns"
490
- )
491
-
492
- # Action Button
493
  with gr.Row():
494
- submit_btn = gr.Button(
495
- "🚀 Start Question Extraction",
496
- variant="primary",
497
- size="lg"
498
- )
499
-
500
- # Hidden session management
501
- session_id_output = gr.Textbox(visible=False)
502
-
503
- # Results Section
504
- with gr.Group():
505
- gr.Markdown("## 📊 Results")
506
-
507
- # Summary
508
- output_summary = gr.Textbox(
509
- label="Processing Summary",
510
- interactive=False,
511
- show_copy_button=True
512
- )
513
-
514
- # Download Sections
515
- with gr.Row():
516
- # Pages Download
517
- with gr.Column(scale=1):
518
- gr.Markdown("### 📄 Download Pages (with boxes)")
519
- download_pages_info = gr.Textbox(
520
- label="Available Pages",
521
- interactive=False,
522
- placeholder="Process files first"
523
- )
524
- download_pages_input = gr.Textbox(
525
- label="Select Pages",
526
- placeholder="e.g., 1-3, 5 (leave blank for all)",
527
- info="Pages with red boxes"
528
- )
529
- download_pages_btn = gr.DownloadButton(
530
- "📥 Download Pages ZIP",
531
- interactive=False,
532
- variant="secondary"
533
- )
534
-
535
- # Questions Download
536
- with gr.Column(scale=1):
537
- gr.Markdown("### 🔍 Download Questions")
538
- download_questions_info = gr.Textbox(
539
- label="Available Questions",
540
- interactive=False,
541
- placeholder="Process files first"
542
- )
543
- download_questions_input = gr.Textbox(
544
- label="Select Questions",
545
- placeholder="e.g., 1-5, 8, 10-12 (leave blank for all)",
546
- info="Individual question images"
547
- )
548
- download_questions_btn = gr.DownloadButton(
549
- "📥 Download Questions ZIP",
550
- interactive=False,
551
- variant="primary"
552
- )
553
-
554
- # Report App Integration
555
- with gr.Column(scale=1):
556
- gr.Markdown("### 📝 Report App")
557
- report_app_input = gr.Textbox(
558
- label="Select Questions for Report",
559
- placeholder="e.g., 1-5, 8 (leave blank for all)",
560
- info="Upload to Report App for analysis"
561
- )
562
- report_app_output = gr.Textbox(
563
- label="Report App URL",
564
- interactive=False,
565
- placeholder="Upload questions to get redirect URL",
566
- show_copy_button=True
567
- )
568
- with gr.Row():
569
- report_upload_btn = gr.Button(
570
- "🚀 Upload to Report App",
571
- interactive=False,
572
- variant="primary"
573
- )
574
- report_open_btn = gr.Button(
575
- "🔗 Open Report App",
576
- interactive=False,
577
- link="",
578
- variant="secondary"
579
- )
580
-
581
- # Image Galleries
582
- with gr.Group():
583
- gr.Markdown("## 🖼️ Visual Results")
584
-
585
- with gr.Tab("Processed Pages (with boxes)"):
586
- output_processed_pages = gr.Gallery(
587
- label="Pages with Question Boundaries",
588
- height=400,
589
- columns=2,
590
- object_fit="contain",
591
- show_label=False
592
- )
593
-
594
- with gr.Tab("Individual Questions"):
595
- output_cropped_gallery = gr.Gallery(
596
- label="Cropped Questions (sorted by number)",
597
- height=400,
598
- columns=4,
599
- object_fit="contain",
600
- show_label=False
601
- )
602
-
603
- # --- Event Handlers ---
604
-
605
- # Main processing handler
606
- submit_btn.click(
607
- fn=question_extractor_app,
608
- inputs=[pdf_input, image_input, split_toggle, page_select_input],
609
- outputs=[output_processed_pages, output_cropped_gallery, output_summary,
610
- session_id_output, download_pages_info, download_questions_info]
611
- ).then(
612
- # Re-enable download buttons after results are ready
613
- lambda: (gr.DownloadButton(interactive=True), gr.DownloadButton(interactive=True), gr.Button(interactive=True)),
614
- outputs=[download_pages_btn, download_questions_btn, report_upload_btn]
615
- )
616
-
617
- # Download handlers
618
- download_pages_btn.click(
619
- fn=zip_selected_pages,
620
- inputs=[download_pages_input, session_id_output],
621
- outputs=[download_pages_btn],
622
- api_name=False
623
- )
624
-
625
- download_questions_btn.click(
626
- fn=zip_selected_questions,
627
- inputs=[download_questions_input, session_id_output],
628
- outputs=[download_questions_btn],
629
- api_name=False
630
- )
631
-
632
- # Report App handlers
633
- def handle_report_upload(questions_input, session_id):
634
- try:
635
- url = upload_to_report_app(questions_input, session_id)
636
- return url, gr.Button(interactive=True, link=url)
637
- except Exception as e:
638
- return f"Error: {str(e)}", gr.Button(interactive=False)
639
-
640
- report_upload_btn.click(
641
- fn=handle_report_upload,
642
- inputs=[report_app_input, session_id_output],
643
- outputs=[report_app_output, report_open_btn]
644
  )
645
 
646
- # Footer
647
- gr.Markdown(
648
- """
649
- ---
650
- 💡 **Tips:**
651
- - Upload either a PDF or image file, not both
652
- - Use page selection to process specific pages from PDFs
653
- - Enable two-column layout for documents with side-by-side content
654
- - **Pages ZIP**: Contains full pages with red boxes showing question boundaries
655
- - **Questions ZIP**: Contains individual cropped question images with descriptive names
656
- - **Report App**: Upload questions to the analysis app on port 1302 for detailed reporting
657
- - **Leave download/upload fields blank to process ALL pages/questions**
658
- """
659
- )
660
 
661
  demo.launch(debug=True)
 
1
  import os
2
  import requests
 
 
3
  from PIL import Image, ImageDraw
4
  import io
5
  import base64
6
+ import json
7
+ import gradio as gr
8
+ import fitz # PyMuPDF
9
  import tempfile
10
+ from typing import Union
11
+
12
+ # --- Configuration & API Constants ---
13
+ INVOKE_URL_OCR = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
14
+ INVOKE_URL_PARSER = "https://integrate.api.nvidia.com/v1/chat/completions"
15
+ MAX_PIXELS_FOR_PARSER = 1024 * 1024 # 1 Megapixel
16
+
17
+ # =================================================================================
18
+ # SELF-CONTAINED REDACTION LOGIC
19
+ # (This is the refined function from the previous step)
20
+ # =================================================================================
21
+
22
+ def _get_average_color_from_regions(image: Image.Image, regions: list[tuple]):
23
+ """Calculates the average RGB color from a list of regions in an image."""
24
+ total_r, total_g, total_b = 0, 0, 0; pixel_count = 0
25
+ img_width, img_height = image.size
26
+ if image.mode == 'RGBA': image = image.convert('RGB')
27
+ pixels = image.load()
28
+ for region in regions:
29
+ x1, y1, x2, y2 = [max(0, int(c)) for c in region]
30
+ x2 = min(img_width, x2); y2 = min(img_height, y2)
31
+ for x in range(x1, x2):
32
+ for y in range(y1, y2):
33
+ r, g, b = pixels[x, y]
34
+ total_r += r; total_g += g; total_b += b
35
+ pixel_count += 1
36
+ if pixel_count == 0: return (0, 0, 0)
37
+ return (total_r // pixel_count, total_g // pixel_count, total_b // pixel_count)
38
+
39
def _extract_picture_bboxes(response_json: dict) -> list:
    """Collect the bounding boxes of 'Picture' elements from a parser response.

    Only the first choice, its first tool call and the first entry of the
    decoded arguments list are inspected. Missing, ``null`` or unexpectedly
    shaped levels yield an empty list instead of an exception.

    Raises:
        json.JSONDecodeError: If the tool-call arguments string is not valid JSON.
    """
    choices = response_json.get('choices') or []
    if not choices:
        return []
    message = choices[0].get('message') or {}
    tool_calls = message.get('tool_calls') or []
    if not tool_calls:
        return []
    function = tool_calls[0].get('function') or {}
    arguments_str = function.get('arguments') or '[]'
    parsed_arguments = json.loads(arguments_str)
    if not parsed_arguments or not isinstance(parsed_arguments, list):
        return []
    elements = parsed_arguments[0]
    if not isinstance(elements, list):
        return []
    return [
        element["bbox"]
        for element in elements
        if isinstance(element, dict)
        and element.get("type") == "Picture"
        and element.get("bbox")
    ]


def _detect_pictures_with_parser(image_to_process: Image.Image, api_key: str):
    """Send an image to the NemoRetriever Parser model and return 'Picture' boxes.

    The image is PNG-encoded, embedded as a base64 ``<img>`` tag in a single
    user message, and the ``markdown_bbox`` tool is forced via ``tool_choice``.

    Args:
        image_to_process: Image to analyse.
        api_key: Bearer token placed in the ``Authorization`` header.

    Returns:
        A list with the ``bbox`` value of every element whose ``type`` is
        ``"Picture"``; empty when the response carries no usable tool call.

    Raises:
        requests.exceptions.RequestException: On transport errors or a non-2xx
            status (via ``raise_for_status``).
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    headers = {"Authorization": f"Bearer {api_key}", "Accept": "application/json"}

    buffered = io.BytesIO()
    image_to_process.save(buffered, format="PNG")
    b64_str = base64.b64encode(buffered.getvalue()).decode("ascii")
    content = f'<img src="data:image/png;base64,{b64_str}" />'

    tool_name = "markdown_bbox"
    payload = {
        "model": "nvidia/nemoretriever-parse",
        "messages": [{"role": "user", "content": content}],
        "tools": [{"type": "function", "function": {"name": tool_name}}],
        "tool_choice": {"type": "function", "function": {"name": tool_name}},
        "max_tokens": 2048,
    }

    response = requests.post(INVOKE_URL_PARSER, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    return _extract_picture_bboxes(response.json())
65
+
66
def _redact_text_in_image(input_image: Image.Image, api_key: str):
    """Send a (cropped) image to the OCR model and return a redacted version.

    Every detected text box is filled with the average color sampled from thin
    strips just outside its four edges, so the patch blends with its
    surroundings.

    Args:
        input_image: Image to scan for text; it is never modified in place.
        api_key: Bearer token placed in the ``Authorization`` header.

    Returns:
        A redacted copy of ``input_image``. The original ``input_image`` object
        is returned unchanged when the OCR request fails or the response lacks
        the ``data[0].text_detections`` list (best effort; a warning is logged).
        Individual malformed detections are skipped without discarding the
        redactions already drawn.
    """
    import logging  # local import: not part of this module's import block

    logger = logging.getLogger(__name__)
    headers = {"Authorization": f"Bearer {api_key}", "Accept": "application/json"}

    buffered = io.BytesIO()
    input_image.save(buffered, format="PNG")
    image_b64 = base64.b64encode(buffered.getvalue()).decode()
    payload = {"input": [{"type": "image_url", "url": f"data:image/png;base64,{image_b64}"}]}

    try:
        response = requests.post(INVOKE_URL_OCR, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        response_json = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException subclass.
        logger.warning("OCR request failed; returning image unredacted: %s", exc)
        return input_image

    try:
        detections = response_json['data'][0]['text_detections']
    except (KeyError, IndexError, TypeError):
        logger.warning("OCR response has no text_detections; returning image unredacted")
        return input_image
    if not isinstance(detections, (list, tuple)):
        return input_image

    image_with_redactions = input_image.copy()
    draw = ImageDraw.Draw(image_with_redactions)
    img_width, img_height = image_with_redactions.size
    # Sampling strip thickness: 1% of the image diagonal, at least one pixel.
    radius = max(1, int(((img_width**2 + img_height**2)**0.5) / 100))

    for detection in detections:
        try:
            points = detection["bounding_box"]["points"]
            # Coordinates are scaled by the image size (presumably normalised
            # to [0, 1] by the service -- confirm against the API reference).
            xs = [point['x'] * img_width for point in points]
            ys = [point['y'] * img_height for point in points]
        except (KeyError, IndexError, TypeError):
            continue  # malformed detection: skip it, keep the others
        if not xs:
            continue

        # Use the axis-aligned hull of all points rather than points[0] and
        # points[2]: the result is identical for a top-left-first quad, and the
        # box can never be inverted, which ImageDraw.rectangle would reject.
        left, right = min(xs), max(xs)
        top, bottom = min(ys), max(ys)

        sample_regions = [
            (left, top - radius, right, top),        # strip above
            (left, bottom, right, bottom + radius),  # strip below
            (left - radius, top, left, bottom),      # strip to the left
            (right, top, right + radius, bottom),    # strip to the right
        ]
        redaction_color = _get_average_color_from_regions(image_with_redactions, sample_regions)
        draw.rectangle([(left, top), (right, bottom)], fill=redaction_color)

    return image_with_redactions
91
+
92
def redact_pictures_in_image(image_source: Union[str, Image.Image], api_key: str, callback: callable = None) -> Image.Image:
    """Find pictures in an image and redact the text inside each of them.

    Args:
        image_source: A PIL image, a path to an existing image file, or a
            base64-encoded image string (tried in that order).
        api_key: Bearer token forwarded to the parser and OCR requests.
        callback: Optional callable receiving human-readable progress messages.

    Returns:
        An RGB image. When no 'Picture' element is detected this is the loaded
        (RGB-converted) input; otherwise a copy with each picture region
        replaced by its redacted version.

    Raises:
        ValueError: If ``image_source`` cannot be loaded as an image.
        requests.exceptions.RequestException: If picture detection fails; the
            error is reported through ``callback`` and re-raised.
    """
    def _progress(message: str):
        if callback:
            callback(message)

    _progress("Loading image for processing...")
    try:
        if isinstance(image_source, Image.Image):
            input_image = image_source.convert("RGB")
        elif os.path.exists(image_source):
            # Context manager closes the underlying file deterministically.
            with Image.open(image_source) as opened:
                input_image = opened.convert("RGB")
        else:
            input_image = Image.open(io.BytesIO(base64.b64decode(image_source))).convert("RGB")
    except Exception as e:
        raise ValueError(f"Invalid image_source. Error: {e}") from e

    image_to_analyze = input_image
    original_width, original_height = input_image.size
    if (original_width * original_height) > MAX_PIXELS_FOR_PARSER:
        _progress("Image is large, resizing for analysis...")
        # Uniform scale that brings the pixel count down to the parser budget.
        scale = (MAX_PIXELS_FOR_PARSER / (original_width * original_height))**0.5
        # Extreme aspect ratios could truncate a side to 0; keep at least 1 px.
        new_dims = (max(1, int(original_width * scale)), max(1, int(original_height * scale)))
        image_to_analyze = input_image.resize(new_dims, Image.Resampling.LANCZOS)

    _progress("Detecting 'Picture' elements...")
    try:
        picture_bboxes = _detect_pictures_with_parser(image_to_analyze, api_key)
    except requests.exceptions.RequestException as e:
        _progress(f"API Error during picture detection: {e}")
        raise

    if not picture_bboxes:
        _progress("No 'Picture' elements found.")
        return input_image

    _progress(f"Found {len(picture_bboxes)} 'Picture' element(s). Redacting text...")
    final_image = input_image.copy()
    for i, box in enumerate(picture_bboxes):
        _progress(f" - Processing picture {i + 1}/{len(picture_bboxes)}...")
        # Box values are fractions of the page size, so they map straight onto
        # the full-resolution image even if detection ran on a downscaled copy.
        # Clamp to the image so the crop never contains padding.
        x1 = min(max(int(box["xmin"] * original_width), 0), original_width)
        y1 = min(max(int(box["ymin"] * original_height), 0), original_height)
        x2 = min(max(int(box["xmax"] * original_width), 0), original_width)
        y2 = min(max(int(box["ymax"] * original_height), 0), original_height)
        if x2 <= x1 or y2 <= y1:
            # A zero-area crop cannot be PNG-encoded for the OCR request.
            _progress(f" - Skipping picture {i + 1}: empty bounding box.")
            continue
        cropped_element = input_image.crop((x1, y1, x2, y2))
        redacted_crop = _redact_text_in_image(cropped_element, api_key)
        final_image.paste(redacted_crop, (x1, y1))

    _progress("Redaction for this page complete.")
    return final_image
135
+
136
+ # =================================================================================
137
+ # GRADIO PDF PROCESSING APPLICATION
138
+ # =================================================================================
139
+
140
def process_pdf(pdf_file, progress=gr.Progress(track_tqdm=True)):
    """
    Redact text inside pictures on every page of an uploaded PDF.

    Each page is rendered to an image, passed through
    ``redact_pictures_in_image``, and the results are written to a new PDF.

    Args:
        pdf_file: The uploaded file from the Gradio ``File`` component; either
            an object exposing the path as ``.name`` or the path itself.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        A ``(output_pdf_path, log_text)`` tuple. On an unexpected failure
        ``output_pdf_path`` is ``None`` and ``log_text`` holds the log
        collected so far followed by the error message.

    Raises:
        gr.Error: If no file was uploaded, the ``NVIDIA_API_KEY`` environment
            variable is missing, or the PDF yields no pages.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    api_key = os.getenv("NVIDIA_API_KEY")
    if not api_key:
        raise gr.Error("NVIDIA_API_KEY environment variable not set.")

    # 150 DPI is a good balance of quality and size. The same value is used
    # when saving so the output pages keep their original physical size.
    render_dpi = 150

    log_messages = []

    def progress_callback(message):
        print(message)  # Also print to console for debugging
        log_messages.append(message)

    try:
        # Accept both a tempfile-like wrapper and a plain path string.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        processed_pages = []

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            for page_num in progress.tqdm(range(page_count), desc="Processing PDF Pages"):
                progress_callback(f"\n--- Processing Page {page_num + 1} of {page_count} ---")

                # Convert page to image
                page = doc.load_page(page_num)
                pix = page.get_pixmap(dpi=render_dpi)
                page_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                # Run the redaction pipeline on the single page image
                processed_image = redact_pictures_in_image(
                    image_source=page_image,
                    api_key=api_key,
                    callback=progress_callback,
                )
                processed_pages.append(processed_image)

        progress_callback("\n--- Finalizing PDF ---")
        if not processed_pages:
            raise gr.Error("No pages were processed from the PDF.")

        # Reserve an output path and release the handle before Pillow writes to it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            output_pdf_path = tmp_file.name

        # Save processed images into a new PDF
        processed_pages[0].save(
            output_pdf_path,
            "PDF",
            resolution=float(render_dpi),
            save_all=True,
            append_images=processed_pages[1:],
        )
        progress_callback(f"Successfully created redacted PDF: {os.path.basename(output_pdf_path)}")

        return output_pdf_path, "\n".join(log_messages)

    except gr.Error:
        # Deliberate user-facing errors must reach the UI, not be swallowed.
        raise
    except Exception as e:
        # Keep the best-effort contract: no file, but return the full log so
        # the user can see how far processing got before the failure.
        progress_callback(f"An error occurred: {e}")
        return None, "\n".join(log_messages)
 
 
 
 
 
 
199
 
 
 
 
 
 
 
200
 
201
+ # --- Gradio UI Definition ---
202
if __name__ == "__main__":
    # Build the UI only when the file is executed as a script.
    with gr.Blocks(theme=gr.themes.Default(), title="NVIDIA PDF Redactor") as demo:
        # Header text shown at the top of the page.
        gr.Markdown(
            """
            # document Redactor for Pictures
            Upload a PDF document. The tool will scan each page for pictures, redact any text found exclusively
            within those pictures, and then generate a new, downloadable PDF with the redactions.
            Pages without pictures are skipped to save time and cost.
            """
        )
        with gr.Row():
            # First column: PDF upload and the button that starts processing.
            with gr.Column(scale=1):
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                process_btn = gr.Button("🚀 Process PDF and Redact Pictures", variant="primary")
            # Second column: the resulting file and the processing log.
            with gr.Column(scale=2):
                pdf_output = gr.File(label="Download Redacted PDF", interactive=False)
                status_log = gr.Textbox(label="Processing Log", lines=15, interactive=False)

        # process_pdf returns (output path, log text), matching the two outputs.
        process_btn.click(
            fn=process_pdf,
            inputs=[pdf_input],
            outputs=[pdf_output, status_log]
        )

        gr.Markdown("---")
        gr.Markdown("Powered by [NVIDIA NIM](https://build.nvidia.com/explore/discover).")

    demo.launch(debug=True)