Yaz Hobooti committed on
Commit
def48ce
·
1 Parent(s): 07087d8

Add multi-page PDF support: process up to 5 pages and combine vertically

Browse files
Files changed (1) hide show
  1. pdf_comparator.py +49 -15
pdf_comparator.py CHANGED
@@ -114,7 +114,7 @@ def normalize_token(token: str) -> str:
114
  def _is_pdf(path: str) -> bool:
115
  return os.path.splitext(path.lower())[1] == ".pdf"
116
 
117
- def load_first_page(path: str, dpi: int = 300) -> Image.Image:
118
  if _is_pdf(path):
119
  # Try pdf2image with multiple poppler paths first
120
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
@@ -122,14 +122,14 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
122
  for poppler_path in poppler_paths:
123
  try:
124
  if poppler_path:
125
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path=poppler_path)
126
  else:
127
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
128
 
129
  if not imgs:
130
  continue
131
 
132
- return imgs[0].convert("RGB")
133
  except Exception as e:
134
  if poppler_path is None: # All pdf2image attempts failed
135
  break
@@ -139,20 +139,48 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
139
  if HAS_PYMUPDF:
140
  try:
141
  doc = fitz.open(path)
142
- page = doc[0] # First page
143
- mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
144
- pix = page.get_pixmap(matrix=mat)
145
- img_data = pix.tobytes("ppm")
146
- img = Image.open(io.BytesIO(img_data))
 
 
 
147
  doc.close()
148
- return img.convert("RGB")
149
  except Exception as e:
150
  raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
151
  else:
152
  raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
153
 
154
  raise ValueError(f"No pages in PDF: {path}")
155
- return Image.open(path).convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
158
  if a.size == b.size:
@@ -455,9 +483,13 @@ def compare_pdfs(file_a, file_b):
455
  if file_a is None or file_b is None:
456
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
457
 
458
- # Load images with default settings
459
- a = load_first_page(file_a.name, dpi=300)
460
- b = load_first_page(file_b.name, dpi=300)
 
 
 
 
461
 
462
  # Match sizes
463
  a, b = match_sizes(a, b)
@@ -495,10 +527,11 @@ def compare_pdfs(file_a, file_b):
495
  # Create status message
496
  status = f"""
497
  📊 **Analysis Complete!**
 
498
  - **Difference regions found:** {len(red_boxes)}
499
  - **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
500
  - **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
501
- - **Image dimensions:** {a.width} × {a.height} pixels
502
 
503
  **Legend:**
504
  - 🔴 Red boxes: Visual differences
@@ -525,6 +558,7 @@ def create_demo():
525
  # πŸ” Advanced PDF Comparison Tool
526
 
527
  Upload two PDF files to get comprehensive analysis including:
 
528
  - **Visual differences** with bounding boxes
529
  - **OCR and spell checking**
530
  - **Barcode/QR code detection**
 
114
  def _is_pdf(path: str) -> bool:
115
  return os.path.splitext(path.lower())[1] == ".pdf"
116
 
117
+ def load_pdf_pages(path: str, dpi: int = 300, max_pages: int = 5) -> List[Image.Image]:
118
  if _is_pdf(path):
119
  # Try pdf2image with multiple poppler paths first
120
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
 
122
  for poppler_path in poppler_paths:
123
  try:
124
  if poppler_path:
125
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
126
  else:
127
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
128
 
129
  if not imgs:
130
  continue
131
 
132
+ return [img.convert("RGB") for img in imgs]
133
  except Exception as e:
134
  if poppler_path is None: # All pdf2image attempts failed
135
  break
 
139
  if HAS_PYMUPDF:
140
  try:
141
  doc = fitz.open(path)
142
+ pages = []
143
+ for page_num in range(min(len(doc), max_pages)):
144
+ page = doc[page_num]
145
+ mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
146
+ pix = page.get_pixmap(matrix=mat)
147
+ img_data = pix.tobytes("ppm")
148
+ img = Image.open(io.BytesIO(img_data))
149
+ pages.append(img.convert("RGB"))
150
  doc.close()
151
+ return pages
152
  except Exception as e:
153
  raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
154
  else:
155
  raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
156
 
157
  raise ValueError(f"No pages in PDF: {path}")
158
+ return [Image.open(path).convert("RGB")]
159
+
160
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Stack page images top-to-bottom on a single white RGB canvas.

    Args:
        pages: Page images to stack, in the order they should appear.
        spacing: Vertical gap, in pixels, inserted between consecutive pages.

    Returns:
        The only page itself (same object, not a copy) when exactly one page
        is given; otherwise a new image as wide as the widest page and as tall
        as all pages plus the gaps between them.

    Raises:
        ValueError: If ``pages`` is empty.
    """
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    # Canvas is as wide as the widest page; gaps appear only between pages,
    # so there is one fewer gap than there are pages.
    canvas_width = max(p.width for p in pages)
    gap_total = spacing * (len(pages) - 1)
    canvas_height = sum(p.height for p in pages) + gap_total

    canvas = Image.new('RGB', (canvas_width, canvas_height), (255, 255, 255))

    top = 0
    for p in pages:
        # Narrower pages are centred horizontally on the canvas.
        left = (canvas_width - p.width) // 2
        canvas.paste(p, (left, top))
        top += p.height + spacing

    return canvas
184
 
185
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
186
  if a.size == b.size:
 
483
  if file_a is None or file_b is None:
484
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
485
 
486
+ # Load images with multiple pages support
487
+ pages_a = load_pdf_pages(file_a.name, dpi=300, max_pages=5)
488
+ pages_b = load_pdf_pages(file_b.name, dpi=300, max_pages=5)
489
+
490
+ # Combine pages into single images for comparison
491
+ a = combine_pages_vertically(pages_a)
492
+ b = combine_pages_vertically(pages_b)
493
 
494
  # Match sizes
495
  a, b = match_sizes(a, b)
 
527
  # Create status message
528
  status = f"""
529
  📊 **Analysis Complete!**
530
+ - **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
531
  - **Difference regions found:** {len(red_boxes)}
532
  - **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
533
  - **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
534
+ - **Combined image dimensions:** {a.width} × {a.height} pixels
535
 
536
  **Legend:**
537
  - 🔴 Red boxes: Visual differences
 
558
  # πŸ” Advanced PDF Comparison Tool
559
 
560
  Upload two PDF files to get comprehensive analysis including:
561
+ - **Multi-page PDF support** (up to 5 pages per document)
562
  - **Visual differences** with bounding boxes
563
  - **OCR and spell checking**
564
  - **Barcode/QR code detection**