"""
Gradio PDF Comparison Tool

Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""

import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional, Iterable
import tempfile
import unicodedata

import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr

# Optional dependencies: each import is guarded so the tool degrades
# gracefully when a library is missing.
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False

try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

try:
    import regex as re  # supports \p{Letter}; stdlib re is the fallback
    HAS_REGEX = True
except Exception:
    import re
    HAS_REGEX = False

try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False


@dataclass
class Box:
    y1: int
    x1: int
    y2: int
    x2: int
    area: int

if HAS_REGEX:
    # Unicode-aware: matches words with accented letters, apostrophes, hyphens.
    _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
else:
    _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")

if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")

    _SPELL_FR = None
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        # French dictionary not installed: fall back to the default dictionary.
        try:
            _SPELL_FR = SpellChecker()
        except Exception:
            _SPELL_FR = None
            print("Warning: French spell checker not available")
else:
    _SPELL_EN = None
    _SPELL_FR = None

_DOMAIN_ALLOWLIST = {
    # Product and workflow terms
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",

    # Print and color terms
    "CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
    "Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
    "Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",

    # Business abbreviations and region codes
    "Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
    "USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",

    # Place names and French terms
    "Québec", "Montréal", "Toronto", "Vancouver", "Ottawa", "Calgary",
    "français", "française", "anglais", "anglaise", "bilingue",

    # Departments and operations
    "Marketing", "Sales", "Customer", "Service", "Quality", "Control",
    "Management", "Administration", "Production", "Manufacturing",
    "Distribution", "Logistics", "Supply", "Chain", "Inventory",

    # Technology terms
    "Email", "Website", "Online", "Internet", "Software", "Hardware",
    "Database", "System", "Network", "Server", "Client", "User",
    "Password", "Login", "Logout", "Account", "Profile", "Settings",
    "Configuration", "Installation", "Maintenance", "Support",

    # Units and math words
    "mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
    "x", "by", "times", "multiply", "divide", "plus", "minus",

    # US / UK spelling pairs
    "color", "colour", "favor", "favour", "honor", "honour",
    "behavior", "behaviour", "neighbor", "neighbour", "center", "centre",
    "theater", "theatre", "meter", "metre", "liter", "litre",

    # Ingredient and chemical vocabulary
    "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
    "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
    "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
    "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
    "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
    "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
    "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
    "copper", "manganese", "selenium", "chromium", "molybdenum",
    "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
    "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
    "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
    "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
    "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
    "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
    "asparagine", "glutamic", "aspartic", "alanine", "glycine",
    "polysorbate", "monostearate", "distearate", "tristearate",
    "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
    "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
    "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
    "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
    "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
    "eicosapentaenoic", "linolenic", "conjugated", "acid", "ester", "amide",
    "anhydride", "hydrochloride", "hydrobromide", "hydroiodide",
    "citrate", "tartrate", "succinate", "fumarate",
    "malate", "lactate", "gluconate",
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)

def _normalize_text(s: str) -> str:
    """Normalize text for better word extraction"""
    if not s:
        return ""

    # Canonical Unicode form.
    s = unicodedata.normalize("NFC", s)

    # Curly apostrophes to straight ones.
    s = s.replace("\u2019", "'").replace("\u2018", "'")

    # Collapse runs of whitespace.
    s = re.sub(r'\s+', ' ', s)

    s = s.strip()

    return s

def _extract_tokens(raw: str):
    """Extract word tokens with improved filtering"""
    s = _normalize_text(raw or "")
    tokens = _WORD_RE.findall(s)

    filtered_tokens = []
    for token in tokens:
        if len(token) >= 2 and _is_likely_word(token):
            filtered_tokens.append(token)

    return filtered_tokens


def _looks_like_acronym(tok: str) -> bool:
    """Check if token looks like a valid acronym"""
    return tok.isupper() and 2 <= len(tok) <= 6


def _has_digits(tok: str) -> bool:
    """Check if token contains digits"""
    return any(ch.isdigit() for ch in tok)

def _is_mostly_numbers(tok: str) -> bool:
    """Check if token is mostly numbers (should be ignored)"""
    if not tok:
        return False

    digit_count = sum(1 for ch in tok if ch.isdigit())
    total_chars = len(tok)

    # Mostly or entirely digits.
    if digit_count / total_chars > 0.7:
        return True
    if digit_count == total_chars:
        return True

    # Ordinals such as 1st, 2nd, 3rd, 4th.
    if total_chars >= 2 and digit_count >= 1:
        suffix = tok[-2:].lower()
        if suffix in ['st', 'nd', 'rd', 'th']:
            return True

    # Decimal numbers.
    if '.' in tok and digit_count > 0:
        return True

    # Percentages.
    if tok.endswith('%') and digit_count > 0:
        return True

    return False

def _is_likely_word(tok: str) -> bool:
    """Check if token looks like a real word (not random characters)"""
    if len(tok) < 2:
        return False

    # Must be mostly letters.
    letter_count = sum(1 for c in tok if c.isalpha())
    if letter_count < len(tok) * 0.6:
        return False

    consonants = set('bcdfghjklmnpqrstvwxyz')

    # Reject tokens dominated by all-consonant 3-letter runs (typical OCR noise):
    # count sliding windows whose characters are all consonants.
    if len(tok) >= 4:
        low = tok.lower()
        consonant_clusters = 0
        for i in range(len(low) - 2):
            if all(c in consonants for c in low[i:i + 3]):
                consonant_clusters += 1
        if consonant_clusters > len(tok) * 0.3:
            return False

    # Reject keyboard walks and sequential runs.
    keyboard_patterns = [
        'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
        'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
        '123456', '234567', '345678', '456789', '567890'
    ]
    tok_lower = tok.lower()
    for pattern in keyboard_patterns:
        if pattern in tok_lower or tok_lower in pattern:
            return False

    return True
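
# Illustrative behavior of the heuristics above (hedged, not a test suite):
# "hello" passes; "qwerty" is rejected as a keyboard walk; "xxzzkk" is
# rejected because every 3-letter window is all consonants.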

def _is_known_word(tok: str) -> bool:
    """Check if token is a known word with comprehensive filtering"""
    t = tok.lower()

    # Tokens that do not even look like words are left to other heuristics.
    if not _is_likely_word(tok):
        return True

    # Numeric tokens are not spell-checked.
    if _is_mostly_numbers(tok):
        return True

    # Allowlist, acronyms, and tokens containing digits.
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True

    # Hyphenated compounds: every part must be known.
    if '-' in tok:
        parts = tok.split('-')
        if all(_is_known_word(part) for part in parts):
            return True

    # English dictionary.
    if _SPELL_EN:
        try:
            if not _SPELL_EN.unknown([t]):
                return True
        except Exception:
            pass

    # French dictionary.
    if _SPELL_FR:
        try:
            if not _SPELL_FR.unknown([t]):
                return True
        except Exception:
            pass

    # Morphology fallback: strip common affixes and retry the English dictionary.
    common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
    common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']

    for suffix in common_suffixes:
        if t.endswith(suffix) and len(t) > len(suffix) + 2:
            base_word = t[:-len(suffix)]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True

    for prefix in common_prefixes:
        if t.startswith(prefix) and len(t) > len(prefix) + 2:
            base_word = t[len(prefix):]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True

    # Simple plural.
    if t.endswith('s') and len(t) > 3:
        singular = t[:-1]
        if _SPELL_EN and not _SPELL_EN.unknown([singular]):
            return True

    return False

def normalize_token(token: str) -> str:
    toks = _extract_tokens(token)
    return (toks[0].lower() if toks else "")
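
# Usage sketch (hedged): normalize_token("Québec!") returns "québec";
# normalize_token("123") returns "" because purely numeric tokens are
# filtered out by _extract_tokens.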

def _is_pdf(path: str) -> bool:
    return os.path.splitext(path.lower())[1] == ".pdf"


def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
    """
    Check if a box is in the excluded bottom area (115mm from bottom).
    Converts mm to pixels using DPI.
    """
    excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
    excluded_top = image_height - excluded_height_pixels
    return box.y1 >= excluded_top
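
# Worked example with the defaults: 115 mm at 400 DPI is
# int(115 * 400 / 25.4) = 1811 px, so on a 5000 px tall image any box whose
# top edge starts at y >= 3189 is excluded.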

def _contains_validation_text(text: str) -> bool:
    """Check if text contains the validation text '50 Carroll'"""
    return "50 Carroll" in text

def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
    """Load PDF pages as images with fallback options"""
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]

    # First choice: pdf2image, probing common poppler locations before
    # falling back to whatever is on PATH (poppler_path=None).
    poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]

    for poppler_path in poppler_paths:
        try:
            if poppler_path:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
            else:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)

            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception:
            if poppler_path is None:
                break
            continue

    # Second choice: render with PyMuPDF.
    if HAS_PYMUPDF:
        try:
            doc = fitz.open(path)
            pages = []
            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("ppm")
                img = Image.open(io.BytesIO(img_data))
                pages.append(img.convert("RGB"))
            doc.close()
            return pages
        except Exception as e:
            raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")

    raise ValueError("Failed to convert PDF to image. No working method available.")

def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Combine multiple pages into a single vertical image"""
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    max_width = max(page.width for page in pages)
    total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)

    combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))

    y_offset = 0
    for page in pages:
        # Center narrower pages horizontally.
        x_offset = (max_width - page.width) // 2
        combined.paste(page, (x_offset, y_offset))
        y_offset += page.height + spacing

    return combined

def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    if a.size == b.size:
        return a, b
    w, h = min(a.width, b.width), min(a.height, b.height)
    return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))


def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    return ImageChops.difference(a, b)

def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    arr = np.asarray(diff_img).astype(np.uint16)
    gray = arr.max(axis=2).astype(np.uint8)
    mask = (gray >= threshold).astype(np.uint8)
    mask = dilation(mask, rectangle(3, 3))
    labeled = label(mask, connectivity=2)
    out: List[Box] = []
    img_height = diff_img.height

    for p in regionprops(labeled):
        if p.area < min_area:
            continue
        minr, minc, maxr, maxc = p.bbox
        box = Box(minr, minc, maxr, maxc, int(p.area))

        # Skip differences inside the excluded bottom strip.
        if _is_in_excluded_bottom_area(box, img_height):
            continue

        out.append(box)
    return out
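
# Tuning sketch (hedged; the values are illustrative, not calibrated): for
# 600 DPI renders, a higher threshold suppresses anti-aliasing noise and a
# larger min_area drops isolated specks:
#   boxes = find_diff_boxes(difference_map(a, b), threshold=30, min_area=100)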

def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None, width: int = 3) -> Image.Image:
    out = img.copy()
    d = ImageDraw.Draw(out)

    for b in red_boxes:
        for w in range(width):
            d.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=(255, 0, 0))

    for b in cyan_boxes:
        for w in range(width):
            d.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=(0, 255, 255))

    if green_boxes:
        for b in green_boxes:
            for w in range(width):
                d.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=(0, 255, 0))
    return out

def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    A = np.asarray(a).copy()
    B = np.asarray(b)
    mask = np.any(A != B, axis=2)
    A[mask] = [255, 0, 0]
    return Image.fromarray(A)


def _get_available_tesseract_langs():
    """Get available Tesseract languages"""
    try:
        langs = pytesseract.get_languages()
        if 'eng' in langs and 'fra' in langs:
            return "eng+fra"
        elif 'eng' in langs:
            return "eng"
        elif langs:
            return langs[0]
        else:
            return "eng"
    except Exception:
        return "eng"


def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Prepare image for better OCR results"""
    from PIL import ImageOps, ImageFilter
    g = img.convert("L")
    g = ImageOps.autocontrast(g)
    g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
    return g

def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
    """Extract text directly from PDF using PyMuPDF"""
    if not HAS_PYMUPDF:
        return []

    try:
        doc = fitz.open(path)
        texts = []
        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]
            text = page.get_text()
            texts.append(text)
        doc.close()
        return texts
    except Exception:
        return []

def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Convert PDF coordinates to image coordinates"""
    pdf_width, pdf_height = pdf_page_size
    img_width, img_height = image_size

    scale_x = img_width / pdf_width
    scale_y = img_height / pdf_height

    x1 = int(pdf_bbox[0] * scale_x)
    y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
    x2 = int(pdf_bbox[2] * scale_x)
    y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)

    return x1, y1, x2, y2
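
# Note (assumption): PyMuPDF bboxes from get_text("dict") already use a
# top-left origin in points, so no y-axis flip is applied here; page_height
# is taken to be the pixel height of one page in the combined image.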

def find_misspell_boxes_from_text(
    pdf_path: str,
    *,
    extra_allow: Optional[Iterable[str]] = None,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None
) -> List[Box]:
    """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
        return []

    # Extend the dictionaries with any caller-supplied allowlist.
    if extra_allow and _SPELL_EN:
        _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
    if extra_allow and _SPELL_FR:
        _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

    boxes: List[Box] = []

    try:
        doc = fitz.open(pdf_path)

        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]

            # Structured extraction: blocks -> lines -> spans, each span with a bbox.
            text_dict = page.get_text("dict")

            for block in text_dict.get("blocks", []):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "").strip()
                        if not text:
                            continue

                        tokens = _extract_tokens(text)
                        has_misspelling = any(
                            len(token) >= 2 and not _is_known_word(token)
                            for token in tokens
                        )
                        if not has_misspelling:
                            continue

                        bbox = span["bbox"]
                        page_rect = page.rect
                        pdf_width = page_rect.width
                        pdf_height = page_rect.height

                        if image_size:
                            img_width, img_height = image_size
                            # Map PDF points to pixels with a uniform scale
                            # derived from the page width, then offset by the
                            # pixel height of the preceding pages (assumes
                            # equal page sizes in the combined image).
                            scale = img_width / pdf_width
                            page_px_height = pdf_height * scale
                            x1 = int(bbox[0] * scale)
                            y1 = int(bbox[1] * scale + page_num * page_px_height)
                            x2 = int(bbox[2] * scale)
                            y2 = int(bbox[3] * scale + page_num * page_px_height)
                        else:
                            # No target size: keep PDF points, with a nominal
                            # 1000-unit vertical offset per page.
                            x1 = int(bbox[0])
                            y1 = int(bbox[1]) + (page_num * 1000)
                            x2 = int(bbox[2])
                            y2 = int(bbox[3]) + (page_num * 1000)

                        box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))

                        # Skip boxes in the excluded bottom strip unless the
                        # span carries the validation text.
                        if image_size:
                            img_height = image_size[1]
                            if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
                                continue

                        boxes.append(box)

        doc.close()

    except Exception:
        # Fallback: flag whole pages that contain any misspelled token.
        page_texts = extract_pdf_text(pdf_path, max_pages)
        for page_num, text in enumerate(page_texts):
            if not text.strip():
                continue

            tokens = _extract_tokens(text)
            misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]

            if misspelled_words:
                boxes.append(Box(
                    y1=page_num * 1000,
                    x1=0,
                    y2=(page_num + 1) * 1000,
                    x2=800,
                    area=800 * 1000
                ))

    return boxes

def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """Legacy OCR-based spell checking (kept for fallback)"""
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Pick an OCR language from what is installed.
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"

    # Upscale small renders so Tesseract has enough pixels per glyph.
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)

    img = prepare_for_ocr(img)

    try:
        if extra_allow and _SPELL_EN:
            _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
        if extra_allow and _SPELL_FR:
            _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"

        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []

    n = len(data.get("text", [])) or 0
    boxes: List[Box] = []

    for i in range(n):
        raw = data["text"][i]
        if not raw:
            continue

        # Skip low-confidence OCR words.
        conf_str = data.get("conf", ["-1"])[i]
        try:
            conf = int(float(conf_str))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # Keep the word only if at least one token is unknown.
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue

        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height", [0])[i]
        if width <= 0 or height <= 0:
            continue

        b = Box(top, left, top + height, left + width, width * height)

        if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
            continue
        boxes.append(b)

    return boxes
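
# Usage sketch (hedged): OCR spell check on one rendered page, allowing a
# brand term that the dictionaries do not know:
#   boxes = find_misspell_boxes(pages[0], min_conf=70, extra_allow={"Digitaljoint"})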


# Barcode / QR detection helpers with several optional decoder backends.
from typing import Dict, Any
from PIL import ImageOps

try:
    import zxingcpp
    HAS_ZXING = True
except Exception:
    HAS_ZXING = False
try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol
    HAS_ZBAR = True
except Exception:
    HAS_ZBAR = False
    ZBarSymbol = None
try:
    from pylibdmtx.pylibdmtx import decode as dmtx_decode
    HAS_DMTX = True
except Exception:
    HAS_DMTX = False
try:
    import cv2
    HAS_CV2 = True
except Exception:
    HAS_CV2 = False

def _binarize(img: Image.Image) -> Image.Image:
    """Create a binarized (black/white) version of the image for better barcode detection"""
    g = ImageOps.grayscale(img)
    g = ImageOps.autocontrast(g)
    return g.point(lambda x: 255 if x > 140 else 0, mode="1").convert("L")


def _ean_checksum_ok(d: str) -> bool:
    """Validate the check digit of an EAN-8 / UPC-A / EAN-13 payload."""
    if not d.isdigit():
        return False
    n = len(d)
    nums = list(map(int, d))
    if n == 8:   # EAN-8: weights 3,1,... over the first 7 digits
        return (10 - (sum(nums[i] * (3 if i % 2 == 0 else 1) for i in range(7)) % 10)) % 10 == nums[7]
    if n == 12:  # UPC-A: weights 3,1,... over the first 11 digits
        return (10 - (sum(nums[i] * (3 if i % 2 == 0 else 1) for i in range(11)) % 10)) % 10 == nums[11]
    if n == 13:  # EAN-13: weights 1,3,... over the first 12 digits
        return (10 - (sum(nums[i] * (1 if i % 2 == 0 else 3) for i in range(12)) % 10)) % 10 == nums[12]
    return True
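
# Worked example: for the EAN-13 payload "4006381333931" the weighted sum of
# the first 12 digits is 89, so the check digit is (10 - 89 % 10) % 10 = 1,
# which matches the final digit and _ean_checksum_ok returns True.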

def _normalize_upc_ean(sym: str, text: str):
    digits = re.sub(r"\D", "", text or "")
    s = (sym or "").upper()
    # A 13-digit EAN starting with 0 is a UPC-A code with a leading zero.
    if s in ("EAN13", "EAN-13") and len(digits) == 13 and digits.startswith("0"):
        return "UPCA", digits[1:]
    return s, (digits if s in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A") else text or "")


def _validate(sym: str, payload: str) -> bool:
    s, norm = _normalize_upc_ean(sym, payload)
    return _ean_checksum_ok(norm) if s in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A") else bool(payload)

def _decode_zxing(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_ZXING:
        return []
    arr = np.asarray(pil.convert("L"))
    out = []
    for r in zxingcpp.read_barcodes(arr):
        # Recover a bounding box from the result position; zxing-cpp exposes
        # this either as an iterable of points or as named corner attributes,
        # depending on version.
        x1 = y1 = x2 = y2 = w = h = 0
        pos = getattr(r, "position", None)
        pts: List[Any] = []
        if pos is not None:
            try:
                pts = list(pos)
            except TypeError:
                corner_names = (
                    "top_left", "topLeft",
                    "top_right", "topRight",
                    "bottom_left", "bottomLeft",
                    "bottom_right", "bottomRight",
                    "point1", "point2", "point3", "point4",
                )
                seen = set()
                for name in corner_names:
                    if hasattr(pos, name):
                        p = getattr(pos, name)
                        if id(p) not in seen and hasattr(p, "x") and hasattr(p, "y"):
                            pts.append(p)
                            seen.add(id(p))
        if pts:
            xs = [int(getattr(p, "x", 0)) for p in pts]
            ys = [int(getattr(p, "y", 0)) for p in pts]
            x1, x2 = min(xs), max(xs)
            y1, y2 = min(ys), max(ys)
            w, h = x2 - x1, y2 - y1
        out.append({
            "type": str(r.format),
            "data": r.text or "",
            "left": x1,
            "top": y1,
            "width": w,
            "height": h,
        })
    return out

def _decode_zbar(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_ZBAR:
        return []
    syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128] if ZBarSymbol else None
    res = zbar_decode(pil, symbols=syms) if syms else zbar_decode(pil)
    return [{"type": d.type,
             "data": (d.data.decode("utf-8", "ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)),
             "left": d.rect.left, "top": d.rect.top, "width": d.rect.width, "height": d.rect.height} for d in res]


def _decode_dmtx(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_DMTX:
        return []
    try:
        res = dmtx_decode(ImageOps.grayscale(pil))
        return [{"type": "DATAMATRIX", "data": r.data.decode("utf-8", "ignore"),
                 "left": r.rect.left, "top": r.rect.top, "width": r.rect.width, "height": r.rect.height} for r in res]
    except Exception:
        return []


def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_CV2:
        return []
    try:
        det = cv2.QRCodeDetector()
        g = np.asarray(pil.convert("L"))
        val, pts, _ = det.detectAndDecode(g)
        if val:
            if pts is not None and len(pts) >= 1:
                pts = pts.reshape(-1, 2)
                xs, ys = pts[:, 0], pts[:, 1]
                x1, x2 = int(xs.min()), int(xs.max())
                y1, y2 = int(ys.min()), int(ys.max())
                w, h = x2 - x1, y2 - y1
            else:
                x1 = y1 = w = h = 0
            return [{"type": "QRCODE", "data": val, "left": x1, "top": y1, "width": w, "height": h}]
    except Exception:
        pass
    return []

def _decode_variants(pil: Image.Image) -> List[Dict[str, Any]]:
    variants = [pil, ImageOps.grayscale(pil), _binarize(pil)]

    # Upscale small images so narrow bars survive decoding.
    w, h = pil.size
    if max(w, h) < 1600:
        up = pil.resize((w * 2, h * 2), resample=Image.NEAREST)
        variants += [up, _binarize(up)]
    for v in variants:
        # Try each backend in turn; first hit wins.
        res = _decode_zxing(v) or _decode_zbar(v) or _decode_dmtx(v) or _decode_cv2_qr(v)
        if res:
            return res

        # Retry at right-angle rotations.
        for angle in (90, 180, 270):
            r = v.rotate(angle, expand=True)
            res = _decode_zxing(r) or _decode_zbar(r) or _decode_dmtx(r) or _decode_cv2_qr(r)
            if res:
                return res
    return []

def _pix_to_pil(pix) -> Image.Image:
    # Drop alpha, then convert to grayscale when the colorspace allows it.
    if pix.alpha:
        pix = fitz.Pixmap(pix, 0)
    try:
        pix = fitz.Pixmap(fitz.csGRAY, pix)
    except Exception:
        pass
    return Image.open(io.BytesIO(pix.tobytes("png")))

def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900, 1200), max_pages=10):
    """Return (boxes, infos) from both rendered pages and embedded images."""
    boxes = []
    infos = []
    doc = fitz.open(pdf_path)
    n = min(len(doc), max_pages)
    for page_idx in range(n):
        page = doc[page_idx]

        # First pass: decode embedded images (XObjects) at native resolution.
        for ix, (xref, *_) in enumerate(page.get_images(full=True)):
            try:
                pix = fitz.Pixmap(doc, xref)
                pil = _pix_to_pil(pix)
                hits = _decode_variants(pil)
                for r in hits:
                    b = Box(r["top"], r["left"], r["top"] + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                    if _is_in_excluded_bottom_area(b, pil.height):
                        continue
                    boxes.append(b)
                    sym, payload = r["type"], r["data"]
                    infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"embed:{ix+1}"})
            except Exception:
                pass

        # Second pass: render the whole page, trying higher DPIs until a hit.
        for dpi in dpi_list:
            scale = dpi / 72.0
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            hits = _decode_variants(pil)
            for r in hits:
                b = Box(r["top"], r["left"], r["top"] + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                if _is_in_excluded_bottom_area(b, pil.height):
                    continue
                boxes.append(b)
                sym, payload = r["type"], r["data"]
                infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@{dpi}dpi"})
            if any(i["page"] == page_idx + 1 for i in infos):
                break
    doc.close()
    return boxes, infos
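

# compare_pdfs falls back to find_barcode_boxes_and_info when PyMuPDF is
# unavailable, but that function is not defined anywhere in this file. A
# minimal sketch that decodes an already-rendered PIL image with the same
# backends, so the call site works; the name and return shape mirror the
# PDF-based variant later in the file:
def find_barcode_boxes_and_info(img: Image.Image):
    """Decode barcodes in a rendered image; returns (boxes, infos)."""
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    for r in _decode_variants(img):
        b = Box(r["top"], r["left"], r["top"] + r["height"],
                r["left"] + r["width"], r["width"] * r["height"])
        if _is_in_excluded_bottom_area(b, img.height):
            continue
        boxes.append(b)
        infos.append({**r, "valid": _validate(r["type"], r["data"]), "page": 1, "source": "image"})
    return boxes, infos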


def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    return np.asarray(img.convert('CMYK')).astype(np.float32)
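
# Note: PIL's built-in RGB-to-CMYK conversion is naive; it performs no black
# generation (K stays 0), so the K% values below reflect PIL's simple model,
# not a calibrated press profile.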

def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float, float, float, float]:
    y1, y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    x1, x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)
    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0)
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)


def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)
    entries = []
    for i, bx in enumerate(red_boxes):
        a_vals = avg_cmyk_in_box(a_cmyk, bx)
        b_vals = avg_cmyk_in_box(b_cmyk, bx)
        delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
        entries.append({'idx': i + 1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
    return entries

def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(panel, (w, 0))
    d = ImageDraw.Draw(out)
    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out
    for e in entries:
        idx = e['idx']
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120, 0, 0)); y += 18
        if y > h - 40:
            break
    return out


def compare_pdfs(file_a, file_b):
    """Main comparison function for Gradio interface"""
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Render and combine pages.
        pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
        pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)

        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)

        a, b = match_sizes(a, b)

        # Pixel differences.
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Spell checking from the PDF text layer.
        image_size = (a.width, a.height)
        misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
        misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []

        print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")

        # Barcode detection: prefer the PDF itself, else the rendered image.
        if HAS_BARCODE:
            bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
            bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)

            print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # CMYK analysis over the differing regions.
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)

        # Annotate both images, then attach the CMYK side panel.
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)

        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        overlay = make_red_overlay(a, b)

        status = f"""
📊 **Analysis Complete!**
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} × {a.height} pixels

**Legend:**
- 🔴 Red boxes: Visual differences
- 🔵 Cyan boxes: Spelling errors
- 🟢 Green boxes: Barcodes/QR codes
"""

        # Tabular barcode results for the two Dataframe components.
        codes_a = [[c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                    c.get('width', 0), c.get('height', 0), c.get('valid', False)] for c in info_a]
        codes_b = [[c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                    c.get('width', 0), c.get('height', 0), c.get('valid', False)] for c in info_b]

        return overlay, a_disp, b_disp, status, codes_a, codes_b

    except Exception as e:
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []


def create_demo():
    custom_theme = gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="blue",
        font=gr.themes.GoogleFont("Inter"),
    ).set(
        body_background_fill="#99cfe9",
        body_background_fill_dark="#99cfe9",
        block_background_fill="#000000",
        block_background_fill_dark="#000000",
        border_color_primary="#333333",
        border_color_primary_dark="#333333",
    )

    with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
        gr.Markdown("""
        # 🔍 Advanced PDF Comparison Tool

        Upload two PDF files to get comprehensive analysis including:
        - **Multi-page PDF support** (up to 15 pages per document)
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False
            )

        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
        ### 📝 Instructions:
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. View results with comprehensive analysis

        ### 🎨 Color Legend:
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors (OCR)
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **📊 Side panel:** CMYK color analysis for print workflows
        """)

    return demo


def _decode_once(img: Image.Image):
    """Single decode attempt with common barcode symbols"""
    if not (HAS_BARCODE and ZBarSymbol):
        return []
    syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=syms)

def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
    """
    Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.

    This function:
    - Renders pages at 600/900/1200 DPI
    - Tries grayscale, binarized, and rotated versions
    - Scans embedded images (XObjects)
    - Prints what it finds and writes debug PNGs
    - Helps identify if barcodes are too thin/low resolution

    Usage:
        debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
    """
    if not (HAS_BARCODE and HAS_PYMUPDF):
        print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
        return

    os.makedirs(outdir, exist_ok=True)
    doc = fitz.open(pdf_path)

    for dpi in (600, 900, 1200):
        scale = dpi / 72.0
        mat = fitz.Matrix(scale, scale)
        print(f"\n=== DPI {dpi} ===")

        for p in range(min(len(doc), max_pages)):
            page = doc[p]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            img.save(f"{outdir}/page{p+1}_{dpi}.png")

            variants = [
                ("orig", img),
                ("gray", ImageOps.grayscale(img)),
                ("bin", _binarize(img)),
            ]
            found = []

            for tag, v in variants:
                r = _decode_once(v)
                if r:
                    found.extend((tag, rr.type, rr.data) for rr in r)
                else:
                    # Retry at right-angle rotations.
                    for angle in (90, 180, 270):
                        rr = _decode_once(v.rotate(angle, expand=True))
                        if rr:
                            found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
                            break

            print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")

            # Also scan embedded images at their native resolution.
            imgs = page.get_images(full=True)
            for ix, (xref, *_) in enumerate(imgs):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    if ipix.alpha:
                        ipix = fitz.Pixmap(ipix, 0)
                    pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
                    pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
                    rr = _decode_once(pil) or _decode_once(_binarize(pil))
                    if rr:
                        print(f"  Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
                except Exception as e:
                    print("  Embedded image error:", e)

    doc.close()
    print(f"\nDebug images saved to: {outdir}/")
    print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")

def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
    """Detect barcodes from the original PDF and return boxes in the same
    coordinate space as the combined display image.

    If image_size is provided (w,h of the vertically combined display image),
    each page is rendered so its width matches w, then decoded. Box y-coordinates
    are offset by the cumulative height of previous pages so that all boxes map
    into the combined image space correctly.
    """
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    try:
        doc = fitz.open(pdf_path)
        num_pages = min(len(doc), max_pages)
        if num_pages == 0:
            return [], []

        target_width = None
        if image_size:
            target_width = int(image_size[0])

        y_offset = 0
        for page_idx in range(num_pages):
            page = doc[page_idx]

            if target_width:
                page_width_pts = float(page.rect.width)
                scale = max(1.0, target_width / page_width_pts)
            else:
                # Default to a 600 DPI render.
                scale = 600.0 / 72.0
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            pw, ph = pil.size
            hits = _decode_variants(pil)
            for r in hits:
                x1 = int(r.get("left", 0))
                y1 = int(r.get("top", 0)) + y_offset
                w = int(r.get("width", 0))
                h = int(r.get("height", 0))
                x2 = x1 + w
                y2 = y1 + h
                b = Box(y1, x1, y2, x2, w * h)

                if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
                    continue
                if not image_size and _is_in_excluded_bottom_area(b, ph):
                    continue
                boxes.append(b)
                sym, payload = r.get("type", ""), r.get("data", "")
                infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
            y_offset += ph
        doc.close()
    except Exception:
        return [], []
    return boxes, infos


if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        share=True,
        show_error=True
    )