#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""
import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional, Iterable
import tempfile
import unicodedata
import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr
# Alternative PDF processing
try:
import fitz # PyMuPDF
HAS_PYMUPDF = True
except Exception:
fitz = None
HAS_PYMUPDF = False
# Optional features
try:
import pytesseract
HAS_OCR = True
except Exception:
pytesseract = None
HAS_OCR = False
try:
from spellchecker import SpellChecker
HAS_SPELLCHECK = True
except Exception:
    # The PyPI package is 'pyspellchecker' but the importable module is
    # 'spellchecker', so there is no separate fallback module to try.
    SpellChecker = None
    HAS_SPELLCHECK = False
try:
import regex as re
HAS_REGEX = True
except Exception:
import re
HAS_REGEX = False
try:
from barcode_reader import read_barcodes_from_path
HAS_BARCODE = True
print("βœ“ Barcode reader imported successfully")
except Exception as e:
read_barcodes_from_path = None
HAS_BARCODE = False
print(f"βœ— Barcode reader import failed: {e}")
# ZXing-CPP support is detected further below (HAS_ZXING / HAS_ANY_BARCODE);
# at this point only the optional barcode_reader import has been attempted.
if HAS_BARCODE:
    print("✓ Barcode detection enabled via barcode_reader")
else:
    print("✗ barcode_reader unavailable; relying on backends detected below")
# -------------------- Core Data --------------------
@dataclass
class Box:
    y1: int
    x1: int
    y2: int
    x2: int
    area: int
# ---- spell/tokenization helpers & caches ----
if HAS_REGEX:
# Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
_WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
else:
# Fallback regex for basic ASCII
_WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
if HAS_SPELLCHECK:
# Initialize English spell checker with comprehensive dictionary
_SPELL_EN = SpellChecker(language="en")
# Try to initialize French spell checker with fallback
_SPELL_FR = None
try:
_SPELL_FR = SpellChecker(language="fr")
except Exception:
# If French dictionary fails, try alternative approach
try:
_SPELL_FR = SpellChecker()
# Load some basic French words manually if needed
except Exception:
_SPELL_FR = None
print("Warning: French spell checker not available")
else:
_SPELL_EN = None
_SPELL_FR = None
_DOMAIN_ALLOWLIST = {
# Company/Brand names
"Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
"SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
# Technical terms
"CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
"Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
"Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",
# Common abbreviations
"Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
"USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",
# French words (common in Canadian context)
"QuΓ©bec", "MontrΓ©al", "Toronto", "Vancouver", "Ottawa", "Calgary",
"franΓ§ais", "franΓ§aise", "anglais", "anglaise", "bilingue",
# Common business terms
"Marketing", "Sales", "Customer", "Service", "Quality", "Control",
"Management", "Administration", "Production", "Manufacturing",
"Distribution", "Logistics", "Supply", "Chain", "Inventory",
# Common words that might be flagged
"Email", "Website", "Online", "Internet", "Software", "Hardware",
"Database", "System", "Network", "Server", "Client", "User",
"Password", "Login", "Logout", "Account", "Profile", "Settings",
"Configuration", "Installation", "Maintenance", "Support",
# Numbers and measurements
"mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
"x", "by", "times", "multiply", "divide", "plus", "minus",
    # British/Canadian spellings (valid variants, not misspellings)
    "colour", "favour", "honour", "behaviour", "neighbour", "centre",
    "theatre", "metre", "litre",
# Cannabis and cannabinoid terms (English)
"cannabis", "cannabinoid", "cannabinoids", "cannabidiol", "cbd", "thc", "tetrahydrocannabinol",
"cannabigerol", "cbg", "cannabinol", "cbn", "cannabichromene", "cbc", "cannabicyclol", "cbl",
"cannabielsoin", "cbe", "cannabitriol", "cbt", "cannabivarin", "cbv", "cannabidivarin", "cbdv",
"tetrahydrocannabivarin", "thcv", "cannabigerovarin", "cbgv", "cannabichromevarin", "cbcv",
"cannabidiolic", "cbda", "tetrahydrocannabinolic", "thca", "cannabigerolic", "cbga",
"cannabichromenic", "cbca", "cannabicyclolic", "cbla", "cannabielsoic", "cbea",
"cannabitriolic", "cbta", "cannabivarinic", "cbva", "cannabidivarinic", "cbdva",
"tetrahydrocannabivarinic", "thcva", "cannabigerovarinic", "cbgva", "cannabichromevarinic", "cbcva",
"terpenes", "terpenoids", "myrcene", "limonene", "pinene", "linalool", "humulene", "caryophyllene",
"terpinolene", "ocimene", "nerolidol", "bisabolol", "eucalyptol", "camphene", "sabinene",
"phytocannabinoids", "endocannabinoids", "anandamide", "arachidonoylglycerol", "2ag",
"cannabinoid", "receptor", "cb1", "cb2", "trpv1", "gpr55", "ppar", "serotonin", "dopamine",
"indica", "sativa", "ruderalis", "hybrid", "hemp", "marijuana", "hashish", "kief", "rosin",
"distillate", "isolate", "full", "spectrum", "broad", "entourage", "effect", "bioavailability",
    # Cannabis terms (French)
    "cannabis", "cannabinoïde", "cannabinoïdes", "cannabidiol", "tétrahydrocannabinol",
    "cannabigérol", "cannabinol", "cannabichromène", "cannabicyclol", "cannabielsoin",
    "cannabitriol", "cannabivarine", "cannabidivarine", "tétrahydrocannabivarine",
    "cannabigérovarine", "cannabichromévarine", "cannabidiolique", "tétrahydrocannabinolique",
    "cannabigérolique", "cannabichroménique", "cannabicyclolique", "cannabielsoïque",
    "cannabitriolique", "cannabivarinique", "cannabidivarinique", "tétrahydrocannabivarinique",
    "cannabigérovarinique", "cannabichromévarinique", "terpènes", "terpénoïdes", "myrcène",
    "limonène", "pinène", "linalol", "humulène", "caryophyllène", "terpinolène", "ocimène",
    "nérolidol", "bisabolol", "eucalyptol", "camphène", "sabinène", "phytocannabinoïdes",
    "endocannabinoïdes", "anandamide", "arachidonoylglycérol", "récepteur", "sérotonine",
    "dopamine", "chanvre", "marijuana", "haschisch", "kief", "rosin", "distillat", "isolat",
    "spectre", "complet", "large", "effet", "d'entourage", "biodisponibilité",
# Common pharmaceutical ingredients and terms
"glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate", "ascorbic", "ascorbate",
"retinol", "retinyl", "palmitate", "stearate", "oleate", "linoleate", "arachidonate",
"docosahexaenoate", "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
"hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl", "heptyl", "octyl",
"nonyl", "decyl", "phenyl", "benzyl", "allyl", "vinyl", "acetyl", "benzoyl", "formyl",
"carboxyl", "carbonyl", "hydroxyl", "amino", "imino", "nitro", "nitroso", "azo",
"phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide", "fluoride",
"sodium", "potassium", "calcium", "magnesium", "zinc", "iron", "copper", "manganese",
"selenium", "chromium", "molybdenum", "cobalt", "nickel", "vanadium", "tungsten",
"thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine", "biotin", "folate",
"cobalamin", "cholecalciferol", "ergocalciferol", "phylloquinone", "menaquinone",
"ubiquinone", "coenzyme", "carnitine", "creatine", "taurine", "glutamine", "arginine",
"lysine", "leucine", "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
"cysteine", "tyrosine", "histidine", "proline", "serine", "threonine", "asparagine",
"glutamic", "aspartic", "alanine", "glycine", "ornithine", "citrulline", "taurine",
"polysorbate", "monostearate", "distearate", "tristearate", "polyethylene", "polypropylene",
"polyvinyl", "carbomer", "carboxymethyl", "cellulose", "hydroxypropyl", "methylcellulose",
"ethylcellulose", "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
"maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol", "stearic",
"palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic", "eicosapentaenoic",
"linolenic", "conjugated", "acid", "ester", "amide", "anhydride", "hydrochloride",
"hydrobromide", "hydroiodide", "citrate", "tartrate", "succinate", "fumarate", "malate",
"lactate", "gluconate", "pamoate", "mesylate", "tosylate", "besylate", "edisylate",
"estolate", "stearate", "palmitate", "oleate", "linoleate", "arachidonate", "butyrate",
"valerate", "caproate", "caprylate", "caprate", "laurate", "myristate", "palmitoleate",
"vaccenate", "gadoleate", "erucate", "nervonate", "lignocerate", "cerotate", "montanate",
"melissate", "laccerate", "psyllate", "juniperate", "sabinate", "abietate", "pimarate",
"sandaracopimarate", "isopimarate", "levopimarate", "dehydroabietate", "neoabietate",
"palustrate", "pimarenate", "sandaracopimarenate", "isopimarenate", "levopimarenate",
"dehydroabietate", "neoabietate", "palustrate", "pimarenate", "sandaracopimarenate",
"isopimarenate", "levopimarenate", "dehydroabietate", "neoabietate", "palustrate",
    # Common pharmaceutical terms and abbreviations (deduplicated; entries form a set)
    "mg", "mcg", "iu", "units", "tablets", "capsules", "softgels", "gummies", "tinctures",
    "oils", "creams", "lotions", "gels", "patches", "injections", "syrups", "suspensions",
    "emulsions", "solutions", "powders", "granules", "pellets", "beads", "microspheres",
    "liposomes", "nanoparticles", "micelles", "vesicles", "cyclodextrins", "dendrimers",
    "bioavailability", "pharmacokinetics", "pharmacodynamics", "metabolism", "elimination",
    "half", "life", "clearance", "volume", "distribution", "protein", "binding", "first",
    "pass", "effect", "cytochrome", "p450", "cyp", "enzymes", "inducers", "inhibitors",
    "substrates", "metabolites", "prodrugs", "active", "inactive", "therapeutic", "index",
    "margin", "safety", "efficacy", "potency", "affinity", "selectivity", "specificity",
    "receptor", "agonists", "antagonists", "partial", "inverse", "allosteric", "modulators",
    "positive", "negative", "competitive", "non", "irreversible", "reversible",
    "uncompetitive", "mixed", "surmountable", "insurmountable", "dose", "response", "curves",
    "ec50", "ic50", "ed50", "ld50", "td50", "noael", "loael", "adme", "absorption",
    "excretion", "renal", "hepatic", "total", "systemic", "apparent", "intrinsic",
    "extraction", "ratio", "presystemic", "gut", "wall", "liver", "plasma", "albumin",
    "glycoprotein", "globulin", "lipoprotein", "erythrocyte", "tissue", "partition",
    "coefficient", "log", "p", "octanol", "water", "membrane", "permeability", "intestinal",
    "blood", "brain", "barrier", "placental", "milk", "skin", "corneal", "nasal",
    "pulmonary", "buccal", "sublingual", "rectal", "vaginal", "transdermal",
    "iontophoretic", "electroporation", "sonophoresis", "microneedle", "nanoparticle",
    "liposome", "micelle", "cyclodextrin", "dendrimer", "vesicle", "nanocapsule",
    "nanosphere", "microsphere", "microcapsule", "nanocrystal", "nanosuspension",
    "microemulsion", "nanoemulsion", "solid", "lipid", "nanostructured", "carrier",
    "self", "microemulsifying", "drug", "delivery", "system", "nanoemulsifying",
    "liquid", "crystalline", "cubosome", "hexosome", "sponge", "phase", "bicontinuous",
    "oil", "multiple", "emulsion", "pickering", "janus", "particle", "core", "shell",
    "hollow", "porous", "mesoporous", "macroporous", "microporous", "hierarchical",
    "ordered", "disordered", "periodic", "aperiodic", "amorphous", "organic", "inorganic",
    "metal", "framework", "covalent", "zeolitic", "imidazolate", "coordination",
    "polymer", "supramolecular", "hyperbranched", "star", "comb", "brush", "graft",
    "block", "copolymer", "random", "alternating", "statistical", "gradient",
    "stereoregular", "stereoirregular", "tacticity", "isotactic", "syndiotactic",
    "atactic", "stereoblock", "stereogradient", "stereoperiodic", "stereoaperiodic"
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
if _SPELL_EN:
_SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
_SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
def _normalize_text(s: str) -> str:
"""Normalize text for better word extraction"""
if not s:
return ""
# Unicode normalization
s = unicodedata.normalize("NFC", s)
    # Normalize curly apostrophes to the straight ASCII form
    s = s.replace("\u2019", "'").replace("\u2018", "'")
# Normalize whitespace - replace multiple spaces with single space
s = re.sub(r'\s+', ' ', s)
# Remove leading/trailing whitespace
s = s.strip()
return s
def _extract_tokens(raw: str):
"""Extract word tokens with improved filtering"""
s = _normalize_text(raw or "")
tokens = _WORD_RE.findall(s)
# Filter out tokens that are too short or don't look like words
filtered_tokens = []
for token in tokens:
if len(token) >= 2 and _is_likely_word(token):
filtered_tokens.append(token)
return filtered_tokens
def _looks_like_acronym(tok: str) -> bool:
"""Check if token looks like a valid acronym"""
return tok.isupper() and 2 <= len(tok) <= 6
def _has_digits(tok: str) -> bool:
"""Check if token contains digits"""
return any(ch.isdigit() for ch in tok)
def _is_mostly_numbers(tok: str) -> bool:
"""Check if token is mostly numbers (should be ignored)"""
if not tok:
return False
# Count digits and letters
digit_count = sum(1 for ch in tok if ch.isdigit())
letter_count = sum(1 for ch in tok if ch.isalpha())
total_chars = len(tok)
# If more than 70% digits, consider it mostly numbers
if digit_count / total_chars > 0.7:
return True
# If it's a pure number (all digits), ignore it
if digit_count == total_chars:
return True
# If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
if total_chars >= 2 and digit_count >= 1:
suffix = tok[-2:].lower()
if suffix in ['st', 'nd', 'rd', 'th']:
return True
# If it's a decimal number (contains digits and decimal point)
if '.' in tok and digit_count > 0:
return True
# If it's a percentage (ends with %)
if tok.endswith('%') and digit_count > 0:
return True
return False
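# Illustrative behavior of _is_mostly_numbers (examples, not from the original
# source): "2024" (all digits) and "1st" (ordinal suffix) are treated as numeric
# and skipped, while "A4" (only 50% digits) is not.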
def _is_likely_word(tok: str) -> bool:
"""Check if token looks like a real word (not random characters)"""
if len(tok) < 2:
return False
# Filter out tokens that are mostly non-letter characters
letter_count = sum(1 for c in tok if c.isalpha())
if letter_count < len(tok) * 0.6: # At least 60% letters
return False
    # Filter out tokens with long runs of consonants (like "qwerty" or "zxcvb")
    consonants = set('bcdfghjklmnpqrstvwxyz')
    if len(tok) >= 4:
        consonant_clusters = 0
        for i in range(len(tok) - 2):
            # Count 3-character windows made up entirely of consonants
            if all(c in consonants for c in tok[i:i+3].lower()):
                consonant_clusters += 1
        # If roughly a third or more of the windows are consonant runs,
        # it is unlikely to be a real word
        if consonant_clusters > len(tok) * 0.3:
            return False
# Filter out tokens that look like random keyboard patterns
keyboard_patterns = [
'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
'123456', '234567', '345678', '456789', '567890'
]
tok_lower = tok.lower()
for pattern in keyboard_patterns:
if pattern in tok_lower or tok_lower in pattern:
return False
return True
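# Illustrative behavior of _is_likely_word: "hello" passes, while keyboard runs
# like "qwerty" or consonant-heavy noise like "zxcvb" are rejected as non-words.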
def _is_known_word(tok: str) -> bool:
"""Check if token is a known word with comprehensive filtering"""
t = tok.lower()
# First check if it looks like a real word
if not _is_likely_word(tok):
return True # Don't flag non-words as misspellings
# Ignore numbers and mostly numeric tokens
if _is_mostly_numbers(tok):
return True # Don't flag numbers as misspellings
# Check domain allowlist, acronyms, and words with digits
if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
return True
# Check hyphenated words - if any part is known, consider the whole word known
if '-' in tok:
parts = tok.split('-')
if all(_is_known_word(part) for part in parts):
return True
# Check against English spell checker
if _SPELL_EN:
try:
# Check if word is known in English dictionary
if not _SPELL_EN.unknown([t]):
return True
except Exception:
pass
# Check against French spell checker
if _SPELL_FR:
try:
# Check if word is known in French dictionary
if not _SPELL_FR.unknown([t]):
return True
except Exception:
pass
# Additional checks for common patterns
# Check for common suffixes/prefixes that might not be in dictionaries
common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']
# Check if word with common suffix/prefix is known
for suffix in common_suffixes:
if t.endswith(suffix) and len(t) > len(suffix) + 2:
base_word = t[:-len(suffix)]
if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
return True
for prefix in common_prefixes:
if t.startswith(prefix) and len(t) > len(prefix) + 2:
base_word = t[len(prefix):]
if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
return True
# Check for plural forms (simple 's' ending)
if t.endswith('s') and len(t) > 3:
singular = t[:-1]
if _SPELL_EN and not _SPELL_EN.unknown([singular]):
return True
return False
# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
def normalize_token(token: str) -> str:
toks = _extract_tokens(token)
return (toks[0].lower() if toks else "")
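# e.g. normalize_token("Hello,") -> "hello"; normalize_token("123") -> ""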
# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
return os.path.splitext(path.lower())[1] == ".pdf"
def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
"""
Check if a box is in the excluded bottom area (115mm from bottom).
Converts mm to pixels using DPI.
"""
# Convert mm to pixels: 1 inch = 25.4mm, so 1mm = dpi/25.4 pixels
excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
# Calculate the top boundary of the excluded area
excluded_top = image_height - excluded_height_pixels
# Check if the box intersects with the excluded area
return box.y1 >= excluded_top
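# Worked example for _is_in_excluded_bottom_area: at dpi=400, 115 mm ->
# int(115 * 400 / 25.4) = 1811 px, so on a 4000 px tall page anything whose top
# edge satisfies y1 >= 4000 - 1811 = 2189 is excluded.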
def _contains_validation_text(text: str) -> bool:
"""Check if text contains the validation text '50 Carroll'"""
return "50 Carroll" in text
def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
"""Load PDF pages as images with fallback options"""
if not _is_pdf(path):
return [Image.open(path).convert("RGB")]
# Try pdf2image first
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
for poppler_path in poppler_paths:
try:
if poppler_path:
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
else:
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
if imgs:
return [img.convert("RGB") for img in imgs]
except Exception:
if poppler_path is None: # All pdf2image attempts failed
break
continue # Try next path
# Fallback to PyMuPDF
if HAS_PYMUPDF:
try:
doc = fitz.open(path)
pages = []
for page_num in range(min(len(doc), max_pages)):
page = doc[page_num]
mat = fitz.Matrix(dpi/72, dpi/72)
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("ppm")
img = Image.open(io.BytesIO(img_data))
pages.append(img.convert("RGB"))
doc.close()
return pages
except Exception as e:
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
raise ValueError("Failed to convert PDF to image. No working method available.")
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
"""Combine multiple pages into a single vertical image"""
if not pages:
raise ValueError("No pages to combine")
if len(pages) == 1:
return pages[0]
# Find the maximum width
max_width = max(page.width for page in pages)
# Calculate total height
total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
# Create combined image
combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
y_offset = 0
for page in pages:
# Center the page horizontally if it's narrower than max_width
x_offset = (max_width - page.width) // 2
combined.paste(page, (x_offset, y_offset))
y_offset += page.height + spacing
return combined
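# e.g. combine_pages_vertically on three 1000x1400 px pages with spacing=20
# yields one 1000x4240 px image (3*1400 + 2*20 = 4240).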
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
if a.size == b.size:
return a, b
w, h = min(a.width, b.width), min(a.height, b.height)
return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
return ImageChops.difference(a, b)
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
arr = np.asarray(diff_img).astype(np.uint16)
gray = arr.max(axis=2).astype(np.uint8)
mask = (gray >= threshold).astype(np.uint8)
mask = dilation(mask, rectangle(3, 3))
labeled = label(mask, connectivity=2)
out: List[Box] = []
img_height = diff_img.height
for p in regionprops(labeled):
if p.area < min_area:
continue
minr, minc, maxr, maxc = p.bbox
box = Box(minr, minc, maxr, maxc, int(p.area))
# Skip boxes in the excluded bottom area
if _is_in_excluded_bottom_area(box, img_height):
continue
out.append(box)
return out
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None, width: int = 3) -> Image.Image:
out = img.copy(); d = ImageDraw.Draw(out)
# red (diff)
for b in red_boxes:
for w in range(width):
d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(255,0,0))
# cyan (misspellings)
for b in cyan_boxes:
for w in range(width):
d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,255))
# green (barcodes)
if green_boxes:
for b in green_boxes:
for w in range(width):
d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,0))
return out
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
A = np.asarray(a).copy(); B = np.asarray(b)
mask = np.any(A != B, axis=2)
A[mask] = [255, 0, 0]
return Image.fromarray(A)
# -------------------- OCR + Spellcheck -------------
# NOTE: the tokenization and spell-check helpers used below (_WORD_RE, _SPELL_EN,
# _SPELL_FR, _normalize_text, _extract_tokens, _looks_like_acronym, _has_digits,
# normalize_token) are the guarded versions defined above. Redefining them here
# with unconditional imports would crash when optional dependencies are missing
# and would clobber the more robust implementations.
def _get_available_tesseract_langs():
"""Get available Tesseract languages"""
try:
langs = pytesseract.get_languages()
if 'eng' in langs and 'fra' in langs:
return "eng+fra"
elif 'eng' in langs:
return "eng"
elif langs:
return langs[0]
else:
return "eng"
except Exception:
return "eng"
def prepare_for_ocr(img: Image.Image) -> Image.Image:
"""Prepare image for better OCR results"""
from PIL import ImageOps, ImageFilter
g = img.convert("L")
g = ImageOps.autocontrast(g)
g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
return g
def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
"""Extract text directly from PDF using PyMuPDF"""
if not HAS_PYMUPDF:
return []
try:
doc = fitz.open(path)
texts = []
for page_num in range(min(len(doc), max_pages)):
page = doc[page_num]
text = page.get_text()
texts.append(text)
doc.close()
return texts
except Exception:
return []
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
"""Convert PDF coordinates to image coordinates"""
pdf_width, pdf_height = pdf_page_size
img_width, img_height = image_size
# Scale factors
scale_x = img_width / pdf_width
scale_y = img_height / pdf_height
# Convert PDF coordinates to image coordinates
x1 = int(pdf_bbox[0] * scale_x)
y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
x2 = int(pdf_bbox[2] * scale_x)
y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
return x1, y1, x2, y2
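# Worked example for convert_pdf_to_image_coords (illustrative): a 612x792 pt
# page rendered at 2448x3168 px gives scale 4.0, so a PDF bbox of
# (72, 72, 144, 90) on page 0 maps to (288, 288, 576, 360).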
def find_misspell_boxes_from_text(
pdf_path: str,
*,
extra_allow: Optional[Iterable[str]] = None,
max_pages: int = 5,
image_size: Optional[Tuple[int, int]] = None
) -> List[Box]:
"""Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
if not (HAS_SPELLCHECK and HAS_PYMUPDF):
return []
# Load extra allowed words
if extra_allow and _SPELL_EN:
_SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
if extra_allow and _SPELL_FR:
_SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
boxes: List[Box] = []
try:
doc = fitz.open(pdf_path)
for page_num in range(min(len(doc), max_pages)):
page = doc[page_num]
# Get text with position information
text_dict = page.get_text("dict")
# Process each block of text
for block in text_dict.get("blocks", []):
if "lines" not in block:
continue
for line in block["lines"]:
for span in line["spans"]:
text = span.get("text", "").strip()
if not text:
continue
# Extract tokens and check for misspellings
tokens = _extract_tokens(text)
has_misspelling = False
for token in tokens:
if len(token) >= 2 and not _is_known_word(token):
has_misspelling = True
break
# If this span has misspellings, create a box for it
if has_misspelling:
bbox = span["bbox"] # [x0, y0, x1, y1]
# Get page dimensions for coordinate conversion
page_rect = page.rect
pdf_width = page_rect.width
pdf_height = page_rect.height
                            # Calculate coordinates. Pages are stacked vertically in
                            # the combined image, so scale uniformly by width and
                            # offset by the per-page pixel height (assumes roughly
                            # equal page sizes; ignores the spacing between pages).
                            if image_size:
                                img_width, img_height = image_size
                                scale = img_width / pdf_width
                                page_px_height = int(pdf_height * scale)
                                x1 = int(bbox[0] * scale)
                                y1 = int(bbox[1] * scale) + (page_num * page_px_height)
                                x2 = int(bbox[2] * scale)
                                y2 = int(bbox[3] * scale) + (page_num * page_px_height)
                            else:
                                scale = 1.0
                                page_px_height = int(pdf_height)
                                x1 = int(bbox[0])
                                y1 = int(bbox[1]) + (page_num * 1000)
                                x2 = int(bbox[2])
                                y2 = int(bbox[3]) + (page_num * 1000)
                            # Create box (combined-image coordinates)
                            box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))
                            # Test exclusion against THIS page's height using
                            # per-page coordinates and the effective DPI
                            per_page_box = Box(y1=int(bbox[1] * scale), x1=x1,
                                               y2=int(bbox[3] * scale), x2=x2, area=box.area)
                            if _is_in_excluded_bottom_area(per_page_box, page_px_height,
                                                           dpi=int(round(72 * scale))):
                                continue
boxes.append(box)
doc.close()
except Exception:
# Fallback to simple text extraction if coordinate mapping fails
page_texts = extract_pdf_text(pdf_path, max_pages)
for page_num, text in enumerate(page_texts):
if not text.strip():
continue
tokens = _extract_tokens(text)
misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]
if misspelled_words:
# Create a placeholder box for the page
placeholder_box = Box(
y1=page_num * 1000,
x1=0,
y2=(page_num + 1) * 1000,
x2=800,
area=800 * 1000
)
# Skip if the placeholder box is in the excluded bottom area
if image_size:
img_height = image_size[1]
if _is_in_excluded_bottom_area(placeholder_box, img_height):
continue
else:
if _is_in_excluded_bottom_area(placeholder_box, 1000):
continue
boxes.append(placeholder_box)
return boxes
def find_misspell_boxes(
img: Image.Image,
*,
min_conf: int = 60,
lang: Optional[str] = None,
extra_allow: Optional[Iterable[str]] = None,
dpi: int = 300,
psm: int = 6,
oem: int = 3
) -> List[Box]:
"""Legacy OCR-based spell checking (kept for fallback)"""
if not (HAS_OCR and HAS_SPELLCHECK):
return []
# Auto-detect language if not provided
if lang is None:
try:
avail = set(pytesseract.get_languages(config="") or [])
except Exception:
avail = {"eng"}
lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
# OPTIONAL: light upscale if the image is small (heuristic)
# target width ~ 2500–3000 px for letter-sized pages
if img.width < 1600:
scale = 2
img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
# Prepare image for better OCR
img = prepare_for_ocr(img)
try:
if extra_allow and _SPELL_EN:
_SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
if extra_allow and _SPELL_FR:
_SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
# Build a config that sets an explicit DPI and keeps spaces
config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
data = pytesseract.image_to_data(
img,
lang=lang,
config=config,
output_type=pytesseract.Output.DICT,
)
except Exception:
return []
n = len(data.get("text", [])) or 0
boxes: List[Box] = []
for i in range(n):
raw = data["text"][i]
if not raw:
continue
# confidence filter
conf_str = data.get("conf", ["-1"])[i]
try:
conf = int(float(conf_str))
except Exception:
conf = -1
if conf < min_conf:
continue
tokens = _extract_tokens(raw)
if not tokens:
continue
# flag the box if ANY token in it looks misspelled
if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
continue
left = data.get("left", [0])[i]
top = data.get("top", [0])[i]
width = data.get("width", [0])[i]
height = data.get("height",[0])[i]
if width <= 0 or height <= 0:
continue
# NOTE: adjust to match your Box constructor if needed
b = Box(top, left, top + height, left + width, width * height)
# Exclude bottom 115mm
if _is_in_excluded_bottom_area(b, img.height):
continue
boxes.append(b)
return boxes
# deps: pip install zxing-cpp pyzbar pylibdmtx PyMuPDF pillow opencv-python-headless regex
# system: macOS -> brew install zbar poppler ; Ubuntu -> sudo apt-get install libzbar0 poppler-utils
# io, re/regex, numpy, PIL.Image and fitz are already imported (with guards) above.
from typing import Dict, Any
from PIL import ImageOps
# Optional backends
try:
    import zxingcpp
    HAS_ZXING = True
except Exception:
    zxingcpp = None
    HAS_ZXING = False
def _zxing_hints_all():
if not HAS_ZXING:
return None
hints = zxingcpp.DecodeHints()
# Work harder + allow rotated orientations internally (keeps coords correct)
try: hints.try_harder = True
except Exception: pass
try: hints.try_rotate = True
except Exception: pass
# GS1 interpretation (FNC1)
try: hints.is_gs1 = True
except Exception: pass
# Enable as many formats as the wrapper exposes (covers GS1 DataBar incl. stacked/expanded)
BF = getattr(zxingcpp, "BarcodeFormat", None)
mask = 0
for nm in [
"QR_CODE", "AZTEC", "PDF417", "DATA_MATRIX", "MAXICODE",
"EAN_13", "EAN_8", "UPC_A", "UPC_E",
"CODE_39", "CODE_93", "CODE_128", "ITF", "CODABAR",
"RSS_14", "RSS_EXPANDED", "RSS_LIMITED", # AKA GS1 DataBar family
"GS1_DATABAR", "GS1_DATABAR_EXPANDED", "GS1_DATABAR_LIMITED" # some wheels expose these names
]:
val = getattr(BF, nm, None)
if val is not None:
mask |= int(val)
if mask:
hints.formats = mask
return hints
try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol
    HAS_ZBAR = True
except Exception:
    HAS_ZBAR = False
    ZBarSymbol = None
try:
    from pylibdmtx.pylibdmtx import decode as dmtx_decode
    HAS_DMTX = True
except Exception:
    HAS_DMTX = False
try:
    import cv2
    HAS_CV2 = True
except Exception:
    HAS_CV2 = False
# Consider barcode capability present if ANY backend is available
HAS_ANY_BARCODE = any([HAS_ZXING, HAS_ZBAR, HAS_DMTX, HAS_CV2])
# your Box(y1,x1,y2,x2,area) assumed to exist
def _binarize(img: Image.Image) -> Image.Image:
g = ImageOps.grayscale(img)
g = ImageOps.autocontrast(g)
return g.point(lambda x: 255 if x > 140 else 0, mode="1").convert("L")
def _ean_checksum_ok(d: str) -> bool:
if not d.isdigit(): return False
n=len(d); nums=list(map(int,d))
if n==8:
return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(7))%10))%10==nums[7]
if n==12:
return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(11))%10))%10==nums[11]
if n==13:
return (10 - (sum(nums[i]*(1 if i%2==0 else 3) for i in range(12))%10))%10==nums[12]
return True
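# Worked example for _ean_checksum_ok: for EAN-13 "4006381333931" the weighted
# sum of the first 12 digits is 89, and (10 - 89 % 10) % 10 == 1 matches the
# final digit, so the checksum passes.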
def _normalize_upc_ean(sym: str, text: str):
digits = re.sub(r"\D","",text or "")
s = (sym or "").upper()
if s in ("EAN13","EAN-13") and len(digits)==13 and digits.startswith("0"):
return "UPCA", digits[1:]
return s, (digits if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else text or "")
def _validate(sym: str, payload: str) -> bool:
s, norm = _normalize_upc_ean(sym, payload)
return _ean_checksum_ok(norm) if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else bool(payload)
def parse_gs1(text: str) -> Optional[dict]:
    """Very lightweight parser for human-readable GS1 strings like '(01)0123...'.
    ZXing reports FNC1 as ASCII 29 (the GS control character); it is swapped for
    a visual separator below, but variable-length AIs are not length-decoded,
    so raw FNC1 payloads may parse only partially. Extend as needed."""
    if not text:
        return None
    s = text.replace("\x1D", ")(")
    ai_pat = re.compile(r"\((\d{2,4})\)([^()]+)")
    out = {}
    for m in ai_pat.finditer(s):
        ai, val = m.group(1), m.group(2)
        out[ai] = val
    return out or None
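# e.g. parse_gs1("(01)09506000134352(17)260101") ->
#      {"01": "09506000134352", "17": "260101"}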
def _decode_zxing_all(pil: Image.Image) -> List[Dict[str, Any]]:
if not HAS_ZXING:
return []
arr = np.asarray(pil.convert("L"))
# Try to use ReaderOptions if available (newer zxing-cpp)
ReaderOptions = getattr(zxingcpp, "ReaderOptions", None)
BarcodeFormat = getattr(zxingcpp, "BarcodeFormat", None)
results = []
try:
if ReaderOptions and BarcodeFormat:
opts = ReaderOptions()
# Enable wide coverage including GS1/stacked-capable formats
opts.formats = (
BarcodeFormat.QR_CODE | getattr(BarcodeFormat, "MICRO_QR", 0) |
BarcodeFormat.DATA_MATRIX |
BarcodeFormat.PDF417 | # stacked rows
BarcodeFormat.AZTEC |
BarcodeFormat.MAXICODE |
BarcodeFormat.EAN_13 | BarcodeFormat.EAN_8 | BarcodeFormat.UPC_A | getattr(BarcodeFormat, "UPC_E", 0) |
BarcodeFormat.CODE_128 | BarcodeFormat.CODE_39 | getattr(BarcodeFormat, "CODE_93", 0) |
BarcodeFormat.ITF | BarcodeFormat.CODABAR |
getattr(BarcodeFormat, "RSS_14", 0) | getattr(BarcodeFormat, "RSS_EXPANDED", 0) # GS1 DataBar
)
opts.try_harder = True
opts.try_rotate = True
# read_barcodes accepts numpy array + options
zx = zxingcpp.read_barcodes(arr, opts)
else:
# Older binding: falls back to default behavior
zx = zxingcpp.read_barcodes(arr)
for r in zx or []:
x1=y1=w=h=0
pos = getattr(r, "position", None)
pts=[]
if pos is not None:
try:
pts=list(pos)
except TypeError:
for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
"point1","point2","point3","point4"):
if hasattr(pos, name):
p=getattr(pos,name)
if hasattr(p,"x") and hasattr(p,"y"):
pts.append(p)
if pts:
xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
results.append({
"type": str(getattr(r,"format", "")),
"data": getattr(r,"text","") or "",
"left": x1, "top": y1, "width": w, "height": h
})
except Exception:
return []
return results
def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
if not HAS_ZBAR:
return []
try:
# Add more 1D formats ZBar supports
syms = []
for nm in ("QRCODE","EAN13","EAN8","UPCA","UPCE","CODE128","CODE39","I25","CODABAR"):
if hasattr(ZBarSymbol, nm):
syms.append(getattr(ZBarSymbol, nm))
res = zbar_decode(pil, symbols=syms) if syms else zbar_decode(pil)
out=[]
for d in res:
data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
out.append({
"type": d.type, "data": data,
"left": d.rect.left, "top": d.rect.top,
"width": d.rect.width, "height": d.rect.height
})
return out
except Exception:
return []
def _decode_dmtx(pil: Image.Image) -> List[Dict[str,Any]]:
if not HAS_DMTX: return []
try:
res=dmtx_decode(ImageOps.grayscale(pil))
return [{"type":"DATAMATRIX","data": r.data.decode("utf-8","ignore"),
"left": r.rect.left, "top": r.rect.top, "width": r.rect.width, "height": r.rect.height} for r in res]
except Exception:
return []
def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str,Any]]:
if not HAS_CV2: return []
try:
det=cv2.QRCodeDetector()
g=np.asarray(pil.convert("L"))
val, pts, _ = det.detectAndDecode(g)
if val:
if pts is not None and len(pts)>=1:
pts=pts.reshape(-1,2); xs,ys=pts[:,0],pts[:,1]
x1,x2=int(xs.min()),int(xs.max()); y1,y2=int(ys.min()),int(ys.max())
w,h=x2-x1,y2-y1
else:
x1=y1=w=h=0
return [{"type":"QRCODE","data":val,"left":x1,"top":y1,"width":w,"height":h}]
except Exception:
pass
return []
def _dedupe_hits(hits: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
seen=set(); out=[]
for r in hits:
# More aggressive deduplication based on content and approximate location
data = r.get("data", "").strip()
if not data: # Skip empty detections
continue
# Round coords to reduce jitter then dedupe
key=(r.get("type",""), data,
int(round(r.get("left",0)/10)*10), int(round(r.get("top",0)/10)*10))
if key in seen:
continue
seen.add(key)
out.append(r)
return out
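# e.g. in _dedupe_hits, two hits with the same type/payload whose left/top round
# to the same 10 px grid cell collapse into a single entry.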
def _decode_variants(pil: Image.Image) -> List[Dict[str,Any]]:
    # Start with the original image; add an upscaled copy for small inputs
variants=[pil]
w,h = pil.size
if max(w,h) < 1600:
up = pil.resize((w*2,h*2), resample=Image.NEAREST)
variants += [up]
hits=[]
for v in variants:
# Only try original orientation to avoid coordinate mapping issues
hits += _decode_zxing_all(v)
hits += _decode_zbar(v)
hits += _decode_dmtx(v)
hits += _decode_cv2_qr(v)
return _dedupe_hits(hits)
def _pix_to_pil(pix) -> Image.Image:
# convert PyMuPDF Pixmap to grayscale PIL without alpha (avoids blur)
if pix.alpha: pix = fitz.Pixmap(pix, 0)
try:
pix = fitz.Pixmap(fitz.csGRAY, pix)
except Exception:
pass
return Image.open(io.BytesIO(pix.tobytes("png")))
def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
"""Return (boxes, infos) from both rendered pages and embedded images."""
boxes=[]; infos=[]
doc=fitz.open(pdf_path)
n=min(len(doc), max_pages)
for page_idx in range(n):
page=doc[page_idx]
# A) Embedded images (often crisp)
for ix,(xref,*_) in enumerate(page.get_images(full=True)):
try:
pix=fitz.Pixmap(doc, xref)
pil=_pix_to_pil(pix)
hits=_decode_variants(pil)
for r in hits:
b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
# Exclude barcodes in the bottom 115mm of the page image
if _is_in_excluded_bottom_area(b, pil.height):
continue
boxes.append(b)
sym, payload = r["type"], r["data"]
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
except Exception:
pass
# B) Render page raster at high DPI (grayscale)
for dpi in dpi_list:
scale=dpi/72.0
try:
pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), colorspace=fitz.csGRAY, alpha=False)
except TypeError:
pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), alpha=False)
pil=_pix_to_pil(pix)
hits=_decode_variants(pil)
for r in hits:
b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
if _is_in_excluded_bottom_area(b, pil.height):
continue
boxes.append(b)
sym, payload = r["type"], r["data"]
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
if any(i["page"]==page_idx+1 for i in infos):
break # found something for this page β†’ next page
doc.close()
return boxes, infos
# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
return np.asarray(img.convert('CMYK')).astype(np.float32) # 0..255
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
y1,y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
x1,x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
if y2<=y1 or x2<=x1:
return (0.0,0.0,0.0,0.0)
region = cmyk_arr[y1:y2, x1:x2, :]
mean_vals = region.reshape(-1, 4).mean(axis=0)
return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)
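# Channel means are rescaled from 0..255 to percentages: e.g. a raw mean of 128
# reports as round(128 * 100 / 255, 1) == 50.2.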
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
a_cmyk = rgb_to_cmyk_array(a_img)
b_cmyk = rgb_to_cmyk_array(b_img)
entries = []
for i, bx in enumerate(red_boxes):
a_vals = avg_cmyk_in_box(a_cmyk, bx)
b_vals = avg_cmyk_in_box(b_cmyk, bx)
delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
entries.append({'idx': i+1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
return entries
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
w,h = base.size
panel = Image.new('RGB', (panel_width, h), (245,245,245))
out = Image.new('RGB', (w+panel_width, h), (255,255,255))
out.paste(base, (0,0)); out.paste(panel, (w,0))
d = ImageDraw.Draw(out)
x0 = w + 8; y = 8
d.text((x0, y), title, fill=(0,0,0)); y += 18
if not entries:
d.text((x0, y), 'No differing regions', fill=(80,80,80))
return out
for e in entries:
idx = e['idx']; aC,aM,aY,aK = e['A']; bC,bM,bY,bK = e['B']; dC,dM,dY,dK = e['Delta']
d.text((x0, y), f"#{idx}", fill=(0,0,0)); y += 14
d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0,0,0)); y += 14
d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0,0,0)); y += 14
d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120,0,0)); y += 18
if y > h - 40: break
return out
# -------------------- Gradio Interface -----------------
def _contains_50_carroll(pdf_path: str) -> bool:
"""Check if PDF contains the text '50 carroll' (case insensitive)"""
try:
if not HAS_PYMUPDF:
return True # Skip validation if PyMuPDF not available
doc = fitz.open(pdf_path)
for page_num in range(min(len(doc), 5)): # Check first 5 pages
page = doc[page_num]
text = page.get_text().lower()
if "50 carroll" in text:
doc.close()
return True
doc.close()
return False
except Exception:
return True # Skip validation on error
def compare_pdfs(file_a, file_b):
"""Main comparison function for Gradio interface"""
try:
if file_a is None or file_b is None:
return None, None, None, "❌ Please upload both PDF files to compare", [], []
# Check for "50 carroll" text in both files
if not _contains_50_carroll(file_a.name) or not _contains_50_carroll(file_b.name):
return None, None, None, "❌ Invalid File type", [], []
# Load images with multiple pages support
pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
# Combine pages into single images for comparison
a = combine_pages_vertically(pages_a)
b = combine_pages_vertically(pages_b)
# Match sizes
a, b = match_sizes(a, b)
# Find differences with default settings
diff = difference_map(a, b)
red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)
# Run all analysis features with defaults
# Use text-based spell checking instead of OCR for better accuracy
# Pass image dimensions for proper coordinate mapping
image_size = (a.width, a.height)
misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
# Debug: Print spell check results
print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
if HAS_ANY_BARCODE:
try:
print(f"Starting barcode detection for file A: {file_a.name}")
bar_a, info_a = find_barcodes_in_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcodes_in_image(a)
print(f"Barcode detection A complete: {len(bar_a)} boxes, {len(info_a)} infos")
print(f"Starting barcode detection for file B: {file_b.name}")
bar_b, info_b = find_barcodes_in_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcodes_in_image(b)
print(f"Barcode detection B complete: {len(bar_b)} boxes, {len(info_b)} infos")
except Exception as e:
print(f"Barcode detection error: {e}")
import traceback
traceback.print_exc()
bar_a, info_a = [], []
bar_b, info_b = [], []
else:
print("No barcode backends available")
bar_a, info_a = [], []
bar_b, info_b = [], []
# Always enable CMYK analysis
cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
# Create visualizations with default box width
a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)
# Always show CMYK panel
a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
# Create pixel difference overlay
overlay = make_red_overlay(a, b)
# Create status message
status = f"""
πŸ“Š **Analysis Complete!**
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} Γ— {a.height} pixels
**Legend:**
- πŸ”΄ Red boxes: Visual differences
- πŸ”΅ Cyan boxes: Spelling errors
- 🟒 Green boxes: Barcodes/QR codes
"""
# Prepare barcode data for tables
codes_a = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_a]
codes_b = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_b]
return overlay, a_disp, b_disp, status, codes_a, codes_b
except Exception as e:
error_msg = f"❌ **Error:** {str(e)}"
return None, None, None, error_msg, [], []
# -------------------- Gradio App -------------------
def create_demo():
    # Custom light-blue theme using only supported Soft-theme parameters
custom_theme = gr.themes.Soft(
primary_hue="blue",
neutral_hue="blue",
font=gr.themes.GoogleFont("Inter"),
).set(
body_background_fill="#99cfe9", # Light blue background
body_background_fill_dark="#99cfe9",
block_background_fill="#000000", # Black blocks for contrast
block_background_fill_dark="#000000",
border_color_primary="#333333", # Dark borders
border_color_primary_dark="#333333",
)
with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
gr.Markdown("""
# πŸ” Advanced PDF Comparison Tool
Upload two PDF files to get comprehensive analysis including:
- **Multi-page PDF support** (up to 15 pages per document)
- **Visual differences** with bounding boxes
- **OCR and spell checking**
- **Barcode/QR code detection**
- **CMYK color analysis**
""")
with gr.Row():
with gr.Column():
file_a = gr.File(label="πŸ“„ PDF A (Reference)", file_types=[".pdf"])
file_b = gr.File(label="πŸ“„ PDF B (Comparison)", file_types=[".pdf"])
compare_btn = gr.Button("πŸ” Compare PDF Files", variant="primary", size="lg")
status_md = gr.Markdown("")
with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")
with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")
gr.Markdown("### πŸ“Š Barcode Detection Results")
with gr.Row():
codes_a_df = gr.Dataframe(
headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
label="Barcodes in File A",
interactive=False
)
codes_b_df = gr.Dataframe(
headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
label="Barcodes in File B",
interactive=False
)
# Event handlers
compare_btn.click(
fn=compare_pdfs,
inputs=[file_a, file_b],
outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
)
gr.Markdown("""
### πŸ“ Instructions:
1. Upload two PDF files
2. Click "Compare PDF Files"
3. View results with comprehensive analysis
### 🎨 Color Legend:
- **πŸ”΄ Red boxes:** Visual differences between files
- **πŸ”΅ Cyan boxes:** Potential spelling errors (OCR)
- **🟒 Green boxes:** Detected barcodes/QR codes
- **πŸ“Š Side panel:** CMYK color analysis for print workflows
""")
return demo
def _decode_once(img: Image.Image):
    """Single decode attempt with common barcode symbols (pyzbar only)."""
    if not HAS_ZBAR:
        return []
    syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=syms)
def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
"""
Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.
This function:
- Renders pages at 600/900/1200 DPI
- Tries grayscale, binarized, and rotated versions
- Scans embedded images (XObjects)
- Prints what it finds and writes debug PNGs
- Helps identify if barcodes are too thin/low resolution
Usage:
debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
"""
if not HAS_PYMUPDF:
print("ERROR: Missing PyMuPDF dependency")
return
os.makedirs(outdir, exist_ok=True)
doc = fitz.open(pdf_path)
for dpi in (600, 900, 1200):
scale = dpi / 72.0
mat = fitz.Matrix(scale, scale)
print(f"\n=== DPI {dpi} ===")
for p in range(min(len(doc), max_pages)):
page = doc[p]
pix = page.get_pixmap(matrix=mat, alpha=False)
img = Image.open(io.BytesIO(pix.tobytes("ppm")))
img.save(f"{outdir}/page{p+1}_{dpi}.png")
# Try different image variants
variants = [
("orig", img),
("gray", ImageOps.grayscale(img)),
("bin", _binarize(img)),
]
found = []
for tag, v in variants:
r = _decode_once(v)
if r:
found.extend((tag, rr.type, rr.data) for rr in r)
else:
# Try rotations
for angle in (90, 180, 270):
rr = _decode_once(v.rotate(angle, expand=True))
if rr:
found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
break
print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")
# Scan embedded images too
imgs = page.get_images(full=True)
for ix, (xref, *_) in enumerate(imgs):
try:
ipix = fitz.Pixmap(doc, xref)
if ipix.alpha:
ipix = fitz.Pixmap(ipix, 0)
pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
rr = _decode_once(pil) or _decode_once(_binarize(pil))
if rr:
print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
except Exception as e:
print(" Embedded image error:", e)
doc.close()
print(f"\nDebug images saved to: {outdir}/")
print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
def find_barcodes_in_pdf(pdf_path: str, image_size: Optional[Tuple[int,int]]=None, max_pages: int = 10):
boxes: List[Box] = []; infos: List[Dict[str,Any]]=[]
try:
doc = fitz.open(pdf_path)
n = min(len(doc), max_pages)
y_offset = 0
target_width = int(image_size[0]) if image_size else None
for page_idx in range(n):
page = doc[page_idx]
if target_width:
scale = max(1.0, float(target_width)/float(page.rect.width))
else:
scale = 600.0/72.0
try:
pix = page.get_pixmap(matrix=fitz.Matrix(scale,scale), colorspace=fitz.csGRAY, alpha=False)
except TypeError:
pix = page.get_pixmap(matrix=fitz.Matrix(scale,scale), alpha=False)
pil = _pix_to_pil(pix)
            # 1) embedded XObjects (often crisp). Decoded coordinates are in the
            # embedded image's own pixel space, so map them into the rendered page
            # raster via the image's placement rect before exclusion/stacking.
            for ix, (xref, *_) in enumerate(page.get_images(full=True)):
                try:
                    epix = fitz.Pixmap(doc, xref)
                    epil = _pix_to_pil(epix)
                    rects = page.get_image_rects(xref)
                    if not rects:
                        continue
                    rect = rects[0]
                    sx = (rect.width * scale) / max(1, epil.size[0])
                    sy = (rect.height * scale) / max(1, epil.size[1])
                    ox, oy = rect.x0 * scale, rect.y0 * scale
                    for r in _decode_variants(epil):
                        left = int(ox + r["left"] * sx)
                        top = int(oy + r["top"] * sy)
                        w = max(1, int(r["width"] * sx))
                        h = max(1, int(r["height"] * sy))
                        # Check if the barcode is in the excluded bottom 115mm area
                        per_page_box = Box(top, left, top + h, left + w, w * h)
                        effective_dpi = int(round(72.0 * scale))
                        if _is_in_excluded_bottom_area(per_page_box, pil.size[1], excluded_height_mm=115.0, dpi=effective_dpi):
                            continue
                        b = Box(top + y_offset, left, top + y_offset + h, left + w, w * h)
                        boxes.append(b)
                        infos.append({**r, "valid": _validate(r.get("type",""), r.get("data","")), "page": page_idx+1, "source": f"embed:{ix+1}"})
                except Exception:
                    pass
# 2) page raster
for r in _decode_variants(pil):
# Check if barcode is in the excluded bottom 115mm area
per_page_box = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
effective_dpi = int(round(72.0 * scale))
if _is_in_excluded_bottom_area(per_page_box, pil.size[1], excluded_height_mm=115.0, dpi=effective_dpi):
continue
b = Box(r["top"]+y_offset, r["left"], r["top"]+y_offset+r["height"], r["left"]+r["width"], r["width"]*r["height"])
boxes.append(b)
infos.append({**r, "valid": _validate(r.get("type",""), r.get("data","")), "page": page_idx+1, "source": f"page@scale{scale:.2f}"})
y_offset += pil.size[1]
doc.close()
except Exception:
return [], []
return boxes, infos
def find_barcodes_in_image(pil: Image.Image):
boxes: List[Box] = []; infos: List[Dict[str,Any]]=[]
for r in _decode_variants(pil):
b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
boxes.append(b)
infos.append({**r, "valid": _validate(r.get("type",""), r.get("data","")), "page": 1, "source": "image"})
return boxes, infos
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
"""Detect barcodes from the original PDF and return boxes in the same
coordinate space as the combined display image.
If image_size is provided (w,h of the vertically combined display image),
each page is rendered so its width matches w, then decoded. Box y-coordinates
are offset by the cumulative height of previous pages so that all boxes map
into the combined image space correctly.
"""
boxes: List[Box] = []
infos: List[Dict[str, Any]] = []
try:
doc = fitz.open(pdf_path)
num_pages = min(len(doc), max_pages)
if num_pages == 0:
return [], []
target_width = None
if image_size:
target_width = int(image_size[0])
y_offset = 0
for page_idx in range(num_pages):
page = doc[page_idx]
# Compute scale so that rendered width matches target_width when provided
if target_width:
page_width_pts = float(page.rect.width) # 72 dpi units
scale = max(1.0, target_width / page_width_pts)
else:
scale = 600.0 / 72.0 # ~600 dpi default
try:
pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
except TypeError:
pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
pil = _pix_to_pil(pix)
pw, ph = pil.size
effective_dpi = 72.0 * scale # <-- this is the real DPI for this rendered page
hits = _decode_variants(pil)
for r in hits:
x1 = int(r.get("left", 0))
y1 = int(r.get("top", 0))
w = int(r.get("width", 0))
h = int(r.get("height", 0))
x2 = x1 + w
y2 = y1 + h
# Per-page box (before stacking)
per_page_box = Box(y1, x1, y2, x2, w*h)
# Exclude the bottom 115mm of THIS PAGE using the correct DPI
if _is_in_excluded_bottom_area(per_page_box, ph, excluded_height_mm=115.0, dpi=int(effective_dpi)):
continue
# Map to combined image by adding the current page's y-offset
combined_box = Box(y1 + y_offset, x1, y2 + y_offset, x2, w*h)
boxes.append(combined_box)
sym, payload = r.get("type",""), r.get("data","")
infos.append({
**r,
"valid": _validate(sym, payload),
"page": page_idx + 1,
"source": f"page@dpi{int(effective_dpi)}"
})
# Add GS1 parsing if available
gs1 = parse_gs1(payload)
if gs1: infos[-1]["gs1"] = gs1
y_offset += ph
doc.close()
except Exception:
return [], []
return boxes, infos
if __name__ == "__main__":
demo = create_demo()
demo.launch(
server_name="0.0.0.0", # Allow external access
        share=True,  # Create a public Gradio share link
show_error=True
)