#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""
import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional, Iterable
import tempfile
import unicodedata
import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr

# Alternative PDF processing
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False

# Optional features
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    try:
        from pyspellchecker import SpellChecker
        HAS_SPELLCHECK = True
    except Exception:
        SpellChecker = None
        HAS_SPELLCHECK = False

try:
    import regex as re
    HAS_REGEX = True
except Exception:
    import re
    HAS_REGEX = False
try:
    from barcode_reader import read_barcodes_from_path
    HAS_BARCODE = True
    print("✅ Barcode reader imported successfully")
except Exception as e:
    read_barcodes_from_path = None
    HAS_BARCODE = False
    print(f"❌ Barcode reader import failed: {e}")

# Enable barcode detection if we have ZXing-CPP or pyzbar.
# NOTE: HAS_ZXING is only defined further down (in the barcode section), so
# the first branch never fires at this point; it is kept as a guard in case
# the import order changes.
if 'HAS_ZXING' in globals() and HAS_ZXING:
    HAS_BARCODE = True
    print("✅ Barcode detection enabled via ZXing-CPP")
elif HAS_BARCODE:
    print("✅ Barcode detection enabled via pyzbar")
else:
    print("❌ No barcode detection available")
# -------------------- Core Data --------------------
@dataclass
class Box:
    y1: int; x1: int; y2: int; x2: int; area: int
# ---- spell/tokenization helpers & caches ----
if HAS_REGEX:
    # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
    _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
else:
    # Fallback regex for basic ASCII
    _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")

if HAS_SPELLCHECK:
    # Initialize English spell checker with comprehensive dictionary
    _SPELL_EN = SpellChecker(language="en")
    # Try to initialize French spell checker with fallback
    _SPELL_FR = None
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        # If French dictionary fails, try alternative approach
        try:
            _SPELL_FR = SpellChecker()
            # Load some basic French words manually if needed
        except Exception:
            _SPELL_FR = None
            print("Warning: French spell checker not available")
else:
    _SPELL_EN = None
    _SPELL_FR = None
_DOMAIN_ALLOWLIST = {
    # Company/Brand names
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
    # Technical terms
    "CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
    "Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
    "Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",
    # Common abbreviations
    "Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
    "USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",
    # French words (common in Canadian context)
    "Québec", "Montréal", "Toronto", "Vancouver", "Ottawa", "Calgary",
    "français", "française", "anglais", "anglaise", "bilingue",
    # Common business terms
    "Marketing", "Sales", "Customer", "Service", "Quality", "Control",
    "Management", "Administration", "Production", "Manufacturing",
    "Distribution", "Logistics", "Supply", "Chain", "Inventory",
    # Common words that might be flagged
    "Email", "Website", "Online", "Internet", "Software", "Hardware",
    "Database", "System", "Network", "Server", "Client", "User",
    "Password", "Login", "Logout", "Account", "Profile", "Settings",
    "Configuration", "Installation", "Maintenance", "Support",
    # Numbers and measurements
    "mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
    "x", "by", "times", "multiply", "divide", "plus", "minus",
    # British spellings that are correct in context
    "colour", "favour", "honour", "behaviour", "neighbour", "centre",
    "theatre", "metre", "litre",
    # Cannabis and cannabinoid terms (English)
    "cannabis", "cannabinoid", "cannabinoids", "cannabidiol", "cbd", "thc", "tetrahydrocannabinol",
    "cannabigerol", "cbg", "cannabinol", "cbn", "cannabichromene", "cbc", "cannabicyclol", "cbl",
    "cannabielsoin", "cbe", "cannabitriol", "cbt", "cannabivarin", "cbv", "cannabidivarin", "cbdv",
    "tetrahydrocannabivarin", "thcv", "cannabigerovarin", "cbgv", "cannabichromevarin", "cbcv",
    "cannabidiolic", "cbda", "tetrahydrocannabinolic", "thca", "cannabigerolic", "cbga",
    "cannabichromenic", "cbca", "cannabicyclolic", "cbla", "cannabielsoic", "cbea",
    "cannabitriolic", "cbta", "cannabivarinic", "cbva", "cannabidivarinic", "cbdva",
    "tetrahydrocannabivarinic", "thcva", "cannabigerovarinic", "cbgva", "cannabichromevarinic", "cbcva",
    "terpenes", "terpenoids", "myrcene", "limonene", "pinene", "linalool", "humulene", "caryophyllene",
    "terpinolene", "ocimene", "nerolidol", "bisabolol", "eucalyptol", "camphene", "sabinene",
    "phytocannabinoids", "endocannabinoids", "anandamide", "arachidonoylglycerol", "2ag",
    "cannabinoid", "receptor", "cb1", "cb2", "trpv1", "gpr55", "ppar", "serotonin", "dopamine",
    "indica", "sativa", "ruderalis", "hybrid", "hemp", "marijuana", "hashish", "kief", "rosin",
    "distillate", "isolate", "full", "spectrum", "broad", "entourage", "effect", "bioavailability",
    # Cannabis terms (French)
    "cannabis", "cannabinoïde", "cannabinoïdes", "cannabidiol", "tétrahydrocannabinol",
    "cannabigérol", "cannabinol", "cannabichromène", "cannabicyclol", "cannabielsoin",
    "cannabitriol", "cannabivarine", "cannabidivarine", "tétrahydrocannabivarine",
    "cannabigérovarine", "cannabichromévarine", "cannabidiolique", "tétrahydrocannabinolique",
    "cannabigérolique", "cannabichroménique", "cannabicyclolique", "cannabielsoïque",
    "cannabitriolique", "cannabivarinique", "cannabidivarinique", "tétrahydrocannabivarinique",
    "cannabigérovarinique", "cannabichromévarinique", "terpènes", "terpénoïdes", "myrcène",
    "limonène", "pinène", "linalol", "humulène", "caryophyllène", "terpinolène", "ocimène",
    "nérolidol", "bisabolol", "eucalyptol", "camphène", "sabinène", "phytocannabinoïdes",
    "endocannabinoïdes", "anandamide", "arachidonoylglycérol", "récepteur", "sérotonine",
    "dopamine", "chanvre", "marijuana", "haschisch", "kief", "rosin", "distillat", "isolat",
    "spectre", "complet", "large", "effet", "d'entourage", "biodisponibilité",
    # Common pharmaceutical ingredients and terms
    "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate", "ascorbic", "ascorbate",
    "retinol", "retinyl", "palmitate", "stearate", "oleate", "linoleate", "arachidonate",
    "docosahexaenoate", "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
    "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl", "heptyl", "octyl",
    "nonyl", "decyl", "phenyl", "benzyl", "allyl", "vinyl", "acetyl", "benzoyl", "formyl",
    "carboxyl", "carbonyl", "hydroxyl", "amino", "imino", "nitro", "nitroso", "azo",
    "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide", "fluoride",
    "sodium", "potassium", "calcium", "magnesium", "zinc", "iron", "copper", "manganese",
    "selenium", "chromium", "molybdenum", "cobalt", "nickel", "vanadium", "tungsten",
    "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine", "biotin", "folate",
    "cobalamin", "cholecalciferol", "ergocalciferol", "phylloquinone", "menaquinone",
    "ubiquinone", "coenzyme", "carnitine", "creatine", "taurine", "glutamine", "arginine",
    "lysine", "leucine", "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
    "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine", "asparagine",
    "glutamic", "aspartic", "alanine", "glycine", "ornithine", "citrulline",
    "polysorbate", "monostearate", "distearate", "tristearate", "polyethylene", "polypropylene",
    "polyvinyl", "carbomer", "carboxymethyl", "cellulose", "hydroxypropyl", "methylcellulose",
    "ethylcellulose", "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
    "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol", "stearic",
    "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic", "eicosapentaenoic",
    "linolenic", "conjugated", "acid", "ester", "amide", "anhydride", "hydrochloride",
    "hydrobromide", "hydroiodide", "citrate", "tartrate", "succinate", "fumarate", "malate",
    "lactate", "gluconate", "pamoate", "mesylate", "tosylate", "besylate", "edisylate",
    "estolate", "butyrate",
    "valerate", "caproate", "caprylate", "caprate", "laurate", "myristate", "palmitoleate",
    "vaccenate", "gadoleate", "erucate", "nervonate", "lignocerate", "cerotate", "montanate",
| "melissate", "laccerate", "psyllate", "juniperate", "sabinate", "abietate", "pimarate", | |
| "sandaracopimarate", "isopimarate", "levopimarate", "dehydroabietate", "neoabietate", | |
| "palustrate", "pimarenate", "sandaracopimarenate", "isopimarenate", "levopimarenate", | |
| "dehydroabietate", "neoabietate", "palustrate", "pimarenate", "sandaracopimarenate", | |
| "isopimarenate", "levopimarenate", "dehydroabietate", "neoabietate", "palustrate", | |
    # Common pharmaceutical terms and abbreviations
    "mg", "mcg", "iu", "units", "tablets", "capsules", "softgels", "gummies", "tinctures",
    "oils", "creams", "lotions", "gels", "patches", "injections", "syrups", "suspensions",
    "emulsions", "solutions", "powders", "granules", "pellets", "beads", "microspheres",
    "liposomes", "nanoparticles", "micelles", "vesicles", "cyclodextrins", "dendrimers",
    "bioavailability", "pharmacokinetics", "pharmacodynamics", "metabolism", "elimination",
    "half", "life", "clearance", "volume", "distribution", "protein", "binding", "first",
    "pass", "effect", "cytochrome", "p450", "cyp", "enzymes", "inducers", "inhibitors",
    "substrates", "metabolites", "prodrugs", "active", "inactive", "therapeutic", "index",
    "margin", "safety", "efficacy", "potency", "affinity", "selectivity", "specificity",
    "receptor", "agonists", "antagonists", "partial", "inverse", "allosteric", "modulators",
    "positive", "negative", "competitive", "non", "irreversible", "reversible",
    "uncompetitive", "mixed", "surmountable", "insurmountable", "dose", "response", "curves",
    "ec50", "ic50", "ed50", "ld50", "td50", "noael", "loael", "adme", "absorption",
    "excretion", "renal", "hepatic", "total", "systemic", "apparent", "intrinsic",
    "extraction", "ratio", "presystemic", "gut", "wall", "liver", "plasma", "albumin",
    "glycoprotein", "globulin", "lipoprotein", "erythrocyte", "tissue", "partition",
    "coefficient", "log", "p", "octanol", "water", "membrane", "permeability", "intestinal",
    "blood", "brain", "barrier", "placental", "milk", "skin", "corneal", "nasal",
    "pulmonary", "buccal", "sublingual", "rectal", "vaginal", "transdermal", "iontophoretic",
    "electroporation", "sonophoresis", "microneedle", "nanoparticle", "liposome", "micelle",
    "cyclodextrin", "dendrimer", "vesicle", "nanocapsule", "nanosphere", "microsphere",
    "microcapsule", "nanocrystal", "nanosuspension", "microemulsion", "nanoemulsion",
    "solid", "lipid", "nanostructured", "carrier", "self", "microemulsifying", "drug",
    "delivery", "system", "nanoemulsifying", "liquid", "crystalline", "cubosome",
    "hexosome", "sponge", "phase", "bicontinuous", "oil", "multiple", "emulsion",
    "pickering", "janus", "particle", "core", "shell", "hollow", "porous", "mesoporous",
    "macroporous", "microporous", "hierarchical", "ordered", "disordered", "periodic",
    "aperiodic", "amorphous", "organic", "inorganic", "metal", "framework", "covalent",
    "zeolitic", "imidazolate", "coordination", "polymer", "supramolecular", "hyperbranched",
    "star", "comb", "brush", "graft", "block", "copolymer", "random", "alternating",
    "statistical", "gradient", "stereoregular", "stereoirregular", "tacticity", "isotactic",
    "syndiotactic", "atactic", "stereoblock", "stereogradient", "stereoperiodic",
    "stereoaperiodic"
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
def _normalize_text(s: str) -> str:
    """Normalize text for better word extraction"""
    if not s:
        return ""
    # Unicode normalization
    s = unicodedata.normalize("NFC", s)
    # Fix common apostrophe issues: map curly quotes to straight apostrophes
    s = s.replace("\u2019", "'").replace("\u2018", "'")
    # Normalize whitespace - replace multiple spaces with single space
    s = re.sub(r'\s+', ' ', s)
    # Remove leading/trailing whitespace
    s = s.strip()
    return s
def _extract_tokens(raw: str):
    """Extract word tokens with improved filtering"""
    s = _normalize_text(raw or "")
    tokens = _WORD_RE.findall(s)
    # Filter out tokens that are too short or don't look like words
    filtered_tokens = []
    for token in tokens:
        if len(token) >= 2 and _is_likely_word(token):
            filtered_tokens.append(token)
    return filtered_tokens

def _looks_like_acronym(tok: str) -> bool:
    """Check if token looks like a valid acronym"""
    return tok.isupper() and 2 <= len(tok) <= 6

def _has_digits(tok: str) -> bool:
    """Check if token contains digits"""
    return any(ch.isdigit() for ch in tok)

def _is_mostly_numbers(tok: str) -> bool:
    """Check if token is mostly numbers (should be ignored)"""
    if not tok:
        return False
    # Count digits and letters
    digit_count = sum(1 for ch in tok if ch.isdigit())
    letter_count = sum(1 for ch in tok if ch.isalpha())
    total_chars = len(tok)
    # If more than 70% digits, consider it mostly numbers
    if digit_count / total_chars > 0.7:
        return True
    # If it's a pure number (all digits), ignore it
    if digit_count == total_chars:
        return True
    # If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
    if total_chars >= 2 and digit_count >= 1:
        suffix = tok[-2:].lower()
        if suffix in ['st', 'nd', 'rd', 'th']:
            return True
    # If it's a decimal number (contains digits and decimal point)
    if '.' in tok and digit_count > 0:
        return True
    # If it's a percentage (ends with %)
    if tok.endswith('%') and digit_count > 0:
        return True
    return False
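# Illustrative examples of the heuristic above (hypothetical inputs):
#   _is_mostly_numbers("2024") -> True   (all digits)
#   _is_mostly_numbers("3rd")  -> True   (digit plus an ordinal suffix)
#   _is_mostly_numbers("A4")   -> False  (50% digits, below the 70% cutoff)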
def _is_likely_word(tok: str) -> bool:
    """Check if token looks like a real word (not random characters)"""
    if len(tok) < 2:
        return False
    # Filter out tokens that are mostly non-letter characters
    letter_count = sum(1 for c in tok if c.isalpha())
    if letter_count < len(tok) * 0.6:  # At least 60% letters
        return False
    # Filter out tokens with too many consecutive consonants
    consonants = set('bcdfghjklmnpqrstvwxyz')
    # Check for excessive consonant clusters (like "qwerty" or "zxcvb") by
    # counting 3-char windows made up entirely of consonants
    if len(tok) >= 4:
        consonant_clusters = 0
        for i in range(len(tok) - 2):
            window = tok[i:i + 3].lower()
            if all(c in consonants for c in window):
                consonant_clusters += 1
        # If too many windows are all-consonant, likely not a word
        if consonant_clusters > len(tok) * 0.3:
            return False
    # Filter out tokens that look like random keyboard patterns
    keyboard_patterns = [
        'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
        'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
        '123456', '234567', '345678', '456789', '567890'
    ]
    tok_lower = tok.lower()
    for pattern in keyboard_patterns:
        if pattern in tok_lower or tok_lower in pattern:
            return False
    return True
def _is_known_word(tok: str) -> bool:
    """Check if token is a known word with comprehensive filtering"""
    t = tok.lower()
    # First check if it looks like a real word
    if not _is_likely_word(tok):
        return True  # Don't flag non-words as misspellings
    # Ignore numbers and mostly numeric tokens
    if _is_mostly_numbers(tok):
        return True  # Don't flag numbers as misspellings
    # Check domain allowlist, acronyms, and words with digits
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True
    # Check hyphenated words - if every part is known, consider the whole word known
    if '-' in tok:
        parts = tok.split('-')
        if all(_is_known_word(part) for part in parts):
            return True
    # Check against English spell checker
    if _SPELL_EN:
        try:
            # Check if word is known in English dictionary
            if not _SPELL_EN.unknown([t]):
                return True
        except Exception:
            pass
    # Check against French spell checker
    if _SPELL_FR:
        try:
            # Check if word is known in French dictionary
            if not _SPELL_FR.unknown([t]):
                return True
        except Exception:
            pass
    # Additional checks for common patterns:
    # common suffixes/prefixes that might not be in dictionaries
    common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
    common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']
    # Check if the word minus a common suffix/prefix is known
    for suffix in common_suffixes:
        if t.endswith(suffix) and len(t) > len(suffix) + 2:
            base_word = t[:-len(suffix)]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True
    for prefix in common_prefixes:
        if t.startswith(prefix) and len(t) > len(prefix) + 2:
            base_word = t[len(prefix):]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True
    # Check for plural forms (simple 's' ending)
    if t.endswith('s') and len(t) > 3:
        singular = t[:-1]
        if _SPELL_EN and not _SPELL_EN.unknown([singular]):
            return True
    return False

# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
def normalize_token(token: str) -> str:
    toks = _extract_tokens(token)
    return (toks[0].lower() if toks else "")
# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
    return os.path.splitext(path.lower())[1] == ".pdf"

def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
    """
    Check if a box is in the excluded bottom area (115mm from bottom).
    Converts mm to pixels using DPI; pass the DPI the image was actually
    rendered at (the default assumes 400 dpi).
    """
    # Convert mm to pixels: 1 inch = 25.4mm, so 1mm = dpi/25.4 pixels
    excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
    # Calculate the top boundary of the excluded area
    excluded_top = image_height - excluded_height_pixels
    # Check if the box intersects with the excluded area
    return box.y1 >= excluded_top
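# Worked example: at the default 400 dpi, 115 mm = int(115 * 400 / 25.4) = 1811 px,
# so on a 5000 px tall image every box whose top edge is at y >= 3189 is excluded.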
def _contains_validation_text(text: str) -> bool:
    """Check if text contains the validation text '50 Carroll'"""
    return "50 Carroll" in text
def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
    """Load PDF pages as images with fallback options"""
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]
    # Try pdf2image first
    poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
    for poppler_path in poppler_paths:
        try:
            if poppler_path:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
            else:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception:
            if poppler_path is None:  # All pdf2image attempts failed
                break
            continue  # Try next path
    # Fallback to PyMuPDF
    if HAS_PYMUPDF:
        try:
            doc = fitz.open(path)
            pages = []
            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                mat = fitz.Matrix(dpi/72, dpi/72)
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("ppm")
                img = Image.open(io.BytesIO(img_data))
                pages.append(img.convert("RGB"))
            doc.close()
            return pages
        except Exception as e:
            raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
    raise ValueError("Failed to convert PDF to image. No working method available.")
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Combine multiple pages into a single vertical image"""
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]
    # Find the maximum width
    max_width = max(page.width for page in pages)
    # Calculate total height
    total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
    # Create combined image
    combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
    y_offset = 0
    for page in pages:
        # Center the page horizontally if it's narrower than max_width
        x_offset = (max_width - page.width) // 2
        combined.paste(page, (x_offset, y_offset))
        y_offset += page.height + spacing
    return combined
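# Worked example: two pages of 800x1000 and 600x1200 px with the default 20 px
# spacing combine into an 800x2220 px canvas; the narrower page is pasted at
# x_offset = (800 - 600) // 2 = 100, y_offset = 1000 + 20 = 1020.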
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    if a.size == b.size:
        return a, b
    w, h = min(a.width, b.width), min(a.height, b.height)
    return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))

def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    return ImageChops.difference(a, b)

def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    arr = np.asarray(diff_img).astype(np.uint16)
    gray = arr.max(axis=2).astype(np.uint8)
    mask = (gray >= threshold).astype(np.uint8)
    mask = dilation(mask, rectangle(3, 3))
    labeled = label(mask, connectivity=2)
    out: List[Box] = []
    img_height = diff_img.height
    for p in regionprops(labeled):
        if p.area < min_area:
            continue
        minr, minc, maxr, maxc = p.bbox
        box = Box(minr, minc, maxr, maxc, int(p.area))
        # Skip boxes in the excluded bottom area
        if _is_in_excluded_bottom_area(box, img_height):
            continue
        out.append(box)
    return out

def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                     width: int = 3) -> Image.Image:
    out = img.copy(); d = ImageDraw.Draw(out)
    # red (diff)
    for b in red_boxes:
        for w in range(width):
            d.rectangle([b.x1-w, b.y1-w, b.x2+w, b.y2+w], outline=(255, 0, 0))
    # cyan (misspellings)
    for b in cyan_boxes:
        for w in range(width):
            d.rectangle([b.x1-w, b.y1-w, b.x2+w, b.y2+w], outline=(0, 255, 255))
    # green (barcodes)
    if green_boxes:
        for b in green_boxes:
            for w in range(width):
                d.rectangle([b.x1-w, b.y1-w, b.x2+w, b.y2+w], outline=(0, 255, 0))
    return out

def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    A = np.asarray(a).copy(); B = np.asarray(b)
    mask = np.any(A != B, axis=2)
    A[mask] = [255, 0, 0]
    return Image.fromarray(A)
# -------------------- OCR + Spellcheck -------------
# The tokenization/spell-check helpers defined above (_WORD_RE, _SPELL_EN,
# _SPELL_FR, _normalize_text, _extract_tokens, normalize_token, ...) are
# reused in this section. They are deliberately not re-imported or
# re-initialized here: doing so unconditionally would crash when an optional
# dependency is missing and would clobber the guarded setup and the full
# domain allowlist.
def _get_available_tesseract_langs():
    """Get available Tesseract languages"""
    try:
        langs = pytesseract.get_languages()
        if 'eng' in langs and 'fra' in langs:
            return "eng+fra"
        elif 'eng' in langs:
            return "eng"
        elif langs:
            return langs[0]
        else:
            return "eng"
    except Exception:
        return "eng"

def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Prepare image for better OCR results"""
    from PIL import ImageOps, ImageFilter
    g = img.convert("L")
    g = ImageOps.autocontrast(g)
    g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
    return g
def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
    """Extract text directly from PDF using PyMuPDF"""
    if not HAS_PYMUPDF:
        return []
    try:
        doc = fitz.open(path)
        texts = []
        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]
            text = page.get_text()
            texts.append(text)
        doc.close()
        return texts
    except Exception:
        return []
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Convert PDF coordinates to image coordinates"""
    pdf_width, pdf_height = pdf_page_size
    img_width, img_height = image_size
    # Scale factors
    scale_x = img_width / pdf_width
    scale_y = img_height / pdf_height
    # Convert PDF coordinates to image coordinates
    x1 = int(pdf_bbox[0] * scale_x)
    y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
    x2 = int(pdf_bbox[2] * scale_x)
    y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
    return x1, y1, x2, y2
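# Worked example: a 612x792 pt (US Letter) page rendered to 2550x3300 px has
# scale factors 2550/612 = 3300/792 ≈ 4.167, so a PDF x-coordinate of 100 pt
# maps to int(100 * 4.167) = 416 px (plus the per-page vertical offset for y).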
def find_misspell_boxes_from_text(
    pdf_path: str,
    *,
    extra_allow: Optional[Iterable[str]] = None,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None
) -> List[Box]:
    """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
        return []
    # Load extra allowed words
    if extra_allow and _SPELL_EN:
        _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
    if extra_allow and _SPELL_FR:
        _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
    boxes: List[Box] = []
    try:
        doc = fitz.open(pdf_path)
        n_pages = min(len(doc), max_pages)
        for page_num in range(n_pages):
            page = doc[page_num]
            # Get text with position information
            text_dict = page.get_text("dict")
            # Process each block of text
            for block in text_dict.get("blocks", []):
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "").strip()
                        if not text:
                            continue
                        # Extract tokens and check for misspellings
                        tokens = _extract_tokens(text)
                        has_misspelling = False
                        for token in tokens:
                            if len(token) >= 2 and not _is_known_word(token):
                                has_misspelling = True
                                break
                        # If this span has misspellings, create a box for it
                        if has_misspelling:
                            bbox = span["bbox"]  # [x0, y0, x1, y1]
                            # Get page dimensions for coordinate conversion
                            page_rect = page.rect
                            pdf_width = page_rect.width
                            pdf_height = page_rect.height
                            # Calculate coordinates
                            if image_size:
                                img_width, img_height = image_size
                                # image_size is the combined (vertically stacked)
                                # image, so scale each page against its share of
                                # the total height (assumes equal-height pages
                                # and ignores the small inter-page spacing).
                                page_px_height = img_height / n_pages
                                scale_x = img_width / pdf_width
                                scale_y = page_px_height / pdf_height
                                x1 = int(bbox[0] * scale_x)
                                y1 = int(bbox[1] * scale_y + page_num * page_px_height)
                                x2 = int(bbox[2] * scale_x)
                                y2 = int(bbox[3] * scale_y + page_num * page_px_height)
                            else:
                                x1 = int(bbox[0])
                                y1 = int(bbox[1]) + (page_num * 1000)
                                x2 = int(bbox[2])
                                y2 = int(bbox[3]) + (page_num * 1000)
                            # Create box
                            box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))
                            # Skip boxes in excluded bottom area
                            if image_size:
                                img_height = image_size[1]
                                if _is_in_excluded_bottom_area(box, img_height):
                                    continue
                            else:
                                # No rendered image: check the page-local
                                # PDF-point coordinates against the page height
                                # (PDF points are 72 per inch).
                                local_box = Box(y1=int(bbox[1]), x1=x1, y2=int(bbox[3]), x2=x2, area=box.area)
                                if _is_in_excluded_bottom_area(local_box, int(pdf_height), dpi=72):
                                    continue
                            boxes.append(box)
        doc.close()
    except Exception:
        # Fallback to simple text extraction if coordinate mapping fails
        page_texts = extract_pdf_text(pdf_path, max_pages)
        for page_num, text in enumerate(page_texts):
            if not text.strip():
                continue
            tokens = _extract_tokens(text)
            misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]
            if misspelled_words:
                # Create a placeholder box for the page
                placeholder_box = Box(
                    y1=page_num * 1000,
                    x1=0,
                    y2=(page_num + 1) * 1000,
                    x2=800,
                    area=800 * 1000
                )
                # Skip if the placeholder box is in the excluded bottom area
                if image_size:
                    img_height = image_size[1]
                    if _is_in_excluded_bottom_area(placeholder_box, img_height):
                        continue
                else:
                    if _is_in_excluded_bottom_area(placeholder_box, 1000):
                        continue
                boxes.append(placeholder_box)
    return boxes
def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """Legacy OCR-based spell checking (kept for fallback)"""
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []
    # Auto-detect language if not provided
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"
    # OPTIONAL: light upscale if the image is small (heuristic);
    # target width ~ 2500-3000 px for letter-sized pages
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
    # Prepare image for better OCR
    img = prepare_for_ocr(img)
    try:
        if extra_allow and _SPELL_EN:
            _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
        if extra_allow and _SPELL_FR:
            _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
        # Build a config that sets an explicit DPI and keeps spaces
        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []
    n = len(data.get("text", [])) or 0
    boxes: List[Box] = []
    for i in range(n):
        raw = data["text"][i]
        if not raw:
            continue
        # confidence filter
        conf_str = data.get("conf", ["-1"])[i]
        try:
            conf = int(float(conf_str))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue
        tokens = _extract_tokens(raw)
        if not tokens:
            continue
        # flag the box if ANY token in it looks misspelled
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue
        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height", [0])[i]
        if width <= 0 or height <= 0:
            continue
        # NOTE: adjust to match your Box constructor if needed
        b = Box(top, left, top + height, left + width, width * height)
        # Exclude bottom 115mm
        if _is_in_excluded_bottom_area(b, img.height):
            continue
        boxes.append(b)
    return boxes
# deps: pip install zxing-cpp pyzbar pylibdmtx PyMuPDF pillow opencv-python-headless regex
# system: macOS -> brew install zbar poppler ; Ubuntu -> sudo apt-get install libzbar0 poppler-utils
from typing import Dict, Any
from PIL import ImageOps

# Optional backends
try:
    import zxingcpp; HAS_ZXING = True
except Exception:
    HAS_ZXING = False
def _zxing_hints_all():
    """Build DecodeHints covering all formats. Not currently called;
    retained for older zxing-cpp wheels that expose DecodeHints instead of
    ReaderOptions (see _decode_zxing_all)."""
    if not HAS_ZXING:
        return None
    hints = zxingcpp.DecodeHints()
    # Work harder + allow rotated orientations internally (keeps coords correct)
    try: hints.try_harder = True
    except Exception: pass
    try: hints.try_rotate = True
    except Exception: pass
    # GS1 interpretation (FNC1)
    try: hints.is_gs1 = True
    except Exception: pass
    # Enable as many formats as the wrapper exposes (covers GS1 DataBar incl. stacked/expanded)
    BF = getattr(zxingcpp, "BarcodeFormat", None)
    mask = 0
    for nm in [
        "QR_CODE", "AZTEC", "PDF417", "DATA_MATRIX", "MAXICODE",
        "EAN_13", "EAN_8", "UPC_A", "UPC_E",
        "CODE_39", "CODE_93", "CODE_128", "ITF", "CODABAR",
        "RSS_14", "RSS_EXPANDED", "RSS_LIMITED",  # AKA GS1 DataBar family
        "GS1_DATABAR", "GS1_DATABAR_EXPANDED", "GS1_DATABAR_LIMITED"  # some wheels expose these names
    ]:
        val = getattr(BF, nm, None)
        if val is not None:
            mask |= int(val)
    if mask:
        hints.formats = mask
    return hints
try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol; HAS_ZBAR = True
except Exception:
    HAS_ZBAR = False; ZBarSymbol = None
try:
    from pylibdmtx.pylibdmtx import decode as dmtx_decode; HAS_DMTX = True
except Exception:
    HAS_DMTX = False
try:
    import cv2; HAS_CV2 = True
except Exception:
    HAS_CV2 = False

# Consider barcode capability present if ANY backend is available
HAS_ANY_BARCODE = any([locals().get("HAS_ZXING", False),
                       locals().get("HAS_ZBAR", False),
                       locals().get("HAS_DMTX", False),
                       locals().get("HAS_CV2", False)])
# Box(y1, x1, y2, x2, area) from the Core Data section is assumed to exist
def _binarize(img: Image.Image) -> Image.Image:
    g = ImageOps.grayscale(img)
    g = ImageOps.autocontrast(g)
    return g.point(lambda x: 255 if x > 140 else 0, mode="1").convert("L")

def _ean_checksum_ok(d: str) -> bool:
    if not d.isdigit(): return False
    n = len(d); nums = list(map(int, d))
    if n == 8:
        return (10 - (sum(nums[i]*(3 if i % 2 == 0 else 1) for i in range(7)) % 10)) % 10 == nums[7]
    if n == 12:
        return (10 - (sum(nums[i]*(3 if i % 2 == 0 else 1) for i in range(11)) % 10)) % 10 == nums[11]
    if n == 13:
        return (10 - (sum(nums[i]*(1 if i % 2 == 0 else 3) for i in range(12)) % 10)) % 10 == nums[12]
    return True
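# Worked example (EAN-13 "4006381333931"): weights alternate 1,3 over the first
# 12 digits, giving a weighted sum of 89; the check digit is
# (10 - 89 % 10) % 10 = 1, which matches the final digit, so the checksum passes.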
def _normalize_upc_ean(sym: str, text: str):
    digits = re.sub(r"\D", "", text or "")
    s = (sym or "").upper()
    if s in ("EAN13", "EAN-13") and len(digits) == 13 and digits.startswith("0"):
        return "UPCA", digits[1:]
    return s, (digits if s in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A") else text or "")
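# Example: an EAN-13 read with a leading zero is reported as the equivalent
# UPC-A code, i.e. _normalize_upc_ean("EAN13", "0012345678905")
# returns ("UPCA", "012345678905").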
def _validate(sym: str, payload: str) -> bool:
    s, norm = _normalize_upc_ean(sym, payload)
    return _ean_checksum_ok(norm) if s in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A") else bool(payload)

def parse_gs1(text: str) -> Optional[dict]:
    if not text: return None
    # ZXing returns FNC1 as ASCII 29 (\x1D) for GS1-128/QR/DM
    s = text.replace("\x1D", ")(")  # visual separator
    # Very lightweight AI parser for common AIs; extend as needed
    import re as _re
    ai_pat = _re.compile(r"\((\d{2,4})\)([^()]+)")
    out = {}
    for m in ai_pat.finditer(s):
        ai, val = m.group(1), m.group(2)
        out[ai] = val
    return out or None
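# Example with human-readable (parenthesized) GS1 text:
#   parse_gs1("(01)09506000134352(17)230331")
#   -> {"01": "09506000134352", "17": "230331"}
# Note the AI pattern only matches parenthesized AIs, so raw FNC1-delimited
# strings are parsed at best partially after the ")(" substitution.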
def _decode_zxing_all(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_ZXING:
        return []
    arr = np.asarray(pil.convert("L"))
    # Try to use ReaderOptions if available (newer zxing-cpp)
    ReaderOptions = getattr(zxingcpp, "ReaderOptions", None)
    BarcodeFormat = getattr(zxingcpp, "BarcodeFormat", None)
    results = []
    try:
        if ReaderOptions and BarcodeFormat:
            opts = ReaderOptions()
            # Enable wide coverage including GS1/stacked-capable formats
            opts.formats = (
                BarcodeFormat.QR_CODE | getattr(BarcodeFormat, "MICRO_QR", 0) |
                BarcodeFormat.DATA_MATRIX |
                BarcodeFormat.PDF417 |  # stacked rows
                BarcodeFormat.AZTEC |
                BarcodeFormat.MAXICODE |
                BarcodeFormat.EAN_13 | BarcodeFormat.EAN_8 | BarcodeFormat.UPC_A | getattr(BarcodeFormat, "UPC_E", 0) |
                BarcodeFormat.CODE_128 | BarcodeFormat.CODE_39 | getattr(BarcodeFormat, "CODE_93", 0) |
                BarcodeFormat.ITF | BarcodeFormat.CODABAR |
                getattr(BarcodeFormat, "RSS_14", 0) | getattr(BarcodeFormat, "RSS_EXPANDED", 0)  # GS1 DataBar
            )
            opts.try_harder = True
            opts.try_rotate = True
            # read_barcodes accepts numpy array + options
            zx = zxingcpp.read_barcodes(arr, opts)
        else:
            # Older binding: falls back to default behavior
            zx = zxingcpp.read_barcodes(arr)
        for r in zx or []:
            x1 = y1 = w = h = 0
            pos = getattr(r, "position", None)
            pts = []
            if pos is not None:
                try:
                    pts = list(pos)
                except TypeError:
                    for name in ("top_left", "topLeft", "top_right", "topRight", "bottom_left", "bottomLeft",
                                 "bottom_right", "bottomRight", "point1", "point2", "point3", "point4"):
                        if hasattr(pos, name):
                            p = getattr(pos, name)
                            if hasattr(p, "x") and hasattr(p, "y"):
                                pts.append(p)
            if pts:
                xs = [int(getattr(p, "x", 0)) for p in pts]; ys = [int(getattr(p, "y", 0)) for p in pts]
                x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2 - x1, y2 - y1
            results.append({
                "type": str(getattr(r, "format", "")),
                "data": getattr(r, "text", "") or "",
                "left": x1, "top": y1, "width": w, "height": h
            })
    except Exception:
        return []
    return results
def _decode_zbar(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_ZBAR:
        return []
    try:
        # Add more 1D formats ZBar supports
        syms = []
        for nm in ("QRCODE", "EAN13", "EAN8", "UPCA", "UPCE", "CODE128", "CODE39", "I25", "CODABAR"):
            if hasattr(ZBarSymbol, nm):
                syms.append(getattr(ZBarSymbol, nm))
        res = zbar_decode(pil, symbols=syms) if syms else zbar_decode(pil)
        out = []
        for d in res:
            data = d.data.decode("utf-8", "ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)
            out.append({
                "type": d.type, "data": data,
                "left": d.rect.left, "top": d.rect.top,
                "width": d.rect.width, "height": d.rect.height
            })
        return out
    except Exception:
        return []

def _decode_dmtx(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_DMTX: return []
    try:
        res = dmtx_decode(ImageOps.grayscale(pil))
        return [{"type": "DATAMATRIX", "data": r.data.decode("utf-8", "ignore"),
                 "left": r.rect.left, "top": r.rect.top, "width": r.rect.width, "height": r.rect.height} for r in res]
    except Exception:
        return []

def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str, Any]]:
    if not HAS_CV2: return []
    try:
        det = cv2.QRCodeDetector()
        g = np.asarray(pil.convert("L"))
        val, pts, _ = det.detectAndDecode(g)
        if val:
            if pts is not None and len(pts) >= 1:
                pts = pts.reshape(-1, 2); xs, ys = pts[:, 0], pts[:, 1]
                x1, x2 = int(xs.min()), int(xs.max()); y1, y2 = int(ys.min()), int(ys.max())
                w, h = x2 - x1, y2 - y1
            else:
                x1 = y1 = w = h = 0
            return [{"type": "QRCODE", "data": val, "left": x1, "top": y1, "width": w, "height": h}]
    except Exception:
        pass
    return []
def _dedupe_hits(hits: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set(); out = []
    for r in hits:
        # More aggressive deduplication based on content and approximate location
        data = r.get("data", "").strip()
        if not data:  # Skip empty detections
            continue
        # Round coords to reduce jitter then dedupe
        key = (r.get("type", ""), data,
               int(round(r.get("left", 0)/10)*10), int(round(r.get("top", 0)/10)*10))
        if key in seen:
            continue
        seen.add(key)
        out.append(r)
    return out
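# Example: two reads of the same QR payload whose top-left corners differ by a
# few pixels (say left=101 vs left=104) round to the same 10 px bucket and are
# collapsed into a single hit.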
def _decode_variants(pil: Image.Image) -> List[Dict[str, Any]]:
    # Start with original image only to avoid false positives
    variants = [pil]
    w, h = pil.size
    if max(w, h) < 1600:
        up = pil.resize((w*2, h*2), resample=Image.NEAREST)
        variants += [up]
    hits = []
    for v in variants:
        # Only try original orientation to avoid coordinate mapping issues
        hits += _decode_zxing_all(v)
        hits += _decode_zbar(v)
        hits += _decode_dmtx(v)
        hits += _decode_cv2_qr(v)
    return _dedupe_hits(hits)

def _pix_to_pil(pix) -> Image.Image:
    # convert PyMuPDF Pixmap to grayscale PIL without alpha (avoids blur)
    if pix.alpha: pix = fitz.Pixmap(pix, 0)
    try:
        pix = fitz.Pixmap(fitz.csGRAY, pix)
    except Exception:
        pass
    return Image.open(io.BytesIO(pix.tobytes("png")))
def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900, 1200), max_pages=10):
    """Return (boxes, infos) from both rendered pages and embedded images."""
    boxes = []; infos = []
    doc = fitz.open(pdf_path)
    n = min(len(doc), max_pages)
    for page_idx in range(n):
        page = doc[page_idx]
        # A) Embedded images (often crisp)
        for ix, (xref, *_) in enumerate(page.get_images(full=True)):
            try:
                pix = fitz.Pixmap(doc, xref)
                pil = _pix_to_pil(pix)
                hits = _decode_variants(pil)
                for r in hits:
                    b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
                    # Exclude barcodes in the bottom 115mm of the page image
                    if _is_in_excluded_bottom_area(b, pil.height):
                        continue
                    boxes.append(b)
                    sym, payload = r["type"], r["data"]
                    infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
            except Exception:
                pass
        # B) Render page raster at high DPI (grayscale)
        for dpi in dpi_list:
            scale = dpi/72.0
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            hits = _decode_variants(pil)
            for r in hits:
                b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
                if _is_in_excluded_bottom_area(b, pil.height):
                    continue
                boxes.append(b)
                sym, payload = r["type"], r["data"]
                infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
            if any(i["page"] == page_idx+1 for i in infos):
                break  # found something for this page -> next page
    doc.close()
    return boxes, infos
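# Minimal usage sketch (hypothetical path):
#   boxes, infos = scan_pdf_barcodes("label.pdf", dpi_list=(900,), max_pages=2)
#   for info in infos:
#       print(info["page"], info["type"], info["data"], info["valid"])
# Note the returned boxes are in the coordinate space of whichever raster each
# code was decoded from (embedded image, or the page render at the given DPI).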
# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    return np.asarray(img.convert('CMYK')).astype(np.float32)  # 0..255

def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float, float, float, float]:
    y1, y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    x1, x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)
    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0)
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)
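# Note: Pillow's RGB->CMYK conversion is a simple non-ICC transform; channel
# means in 0..255 are rescaled to percentages, e.g. a mean of 127.5 reports
# as 50.0%.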
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)
    entries = []
    for i, bx in enumerate(red_boxes):
        a_vals = avg_cmyk_in_box(a_cmyk, bx)
        b_vals = avg_cmyk_in_box(b_cmyk, bx)
        delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
        entries.append({'idx': i+1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
    return entries

def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w+panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0)); out.paste(panel, (w, 0))
    d = ImageDraw.Draw(out)
    x0 = w + 8; y = 8
    d.text((x0, y), title, fill=(0, 0, 0)); y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out
    for e in entries:
        idx = e['idx']; aC, aM, aY, aK = e['A']; bC, bM, bY, bK = e['B']; dC, dM, dY, dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120, 0, 0)); y += 18
        if y > h - 40: break
    return out
# -------------------- Gradio Interface -----------------
def _contains_50_carroll(pdf_path: str) -> bool:
    """Check if PDF contains the text '50 carroll' (case insensitive)"""
    try:
        if not HAS_PYMUPDF:
            return True  # Skip validation if PyMuPDF not available
        doc = fitz.open(pdf_path)
        for page_num in range(min(len(doc), 5)):  # Check first 5 pages
            page = doc[page_num]
            text = page.get_text().lower()
            if "50 carroll" in text:
                doc.close()
                return True
        doc.close()
        return False
    except Exception:
        return True  # Skip validation on error
| def compare_pdfs(file_a, file_b): | |
| """Main comparison function for Gradio interface""" | |
| try: | |
| if file_a is None or file_b is None: | |
| return None, None, None, "β Please upload both PDF files to compare", [], [] | |
| # Check for "50 carroll" text in both files | |
| if not _contains_50_carroll(file_a.name) or not _contains_50_carroll(file_b.name): | |
| return None, None, None, "β Invalid File type", [], [] | |
| # Load images with multiple pages support | |
| pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15) | |
| pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15) | |
| # Combine pages into single images for comparison | |
| a = combine_pages_vertically(pages_a) | |
| b = combine_pages_vertically(pages_b) | |
| # Match sizes | |
| a, b = match_sizes(a, b) | |
| # Find differences with default settings | |
| diff = difference_map(a, b) | |
| red_boxes = find_diff_boxes(diff, threshold=12, min_area=25) | |
| # Run all analysis features with defaults | |
| # Use text-based spell checking instead of OCR for better accuracy | |
| # Pass image dimensions for proper coordinate mapping | |
| image_size = (a.width, a.height) | |
| misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else [] | |
| misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else [] | |
| # Debug: Print spell check results | |
| print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes") | |
| if HAS_ANY_BARCODE: | |
| try: | |
| print(f"Starting barcode detection for file A: {file_a.name}") | |
| bar_a, info_a = find_barcodes_in_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcodes_in_image(a) | |
| print(f"Barcode detection A complete: {len(bar_a)} boxes, {len(info_a)} infos") | |
| print(f"Starting barcode detection for file B: {file_b.name}") | |
| bar_b, info_b = find_barcodes_in_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcodes_in_image(b) | |
| print(f"Barcode detection B complete: {len(bar_b)} boxes, {len(info_b)} infos") | |
| except Exception as e: | |
| print(f"Barcode detection error: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| bar_a, info_a = [], [] | |
| bar_b, info_b = [], [] | |
| else: | |
| print("No barcode backends available") | |
| bar_a, info_a = [], [] | |
| bar_b, info_b = [], [] | |
| # Always enable CMYK analysis | |
| cmyk_entries = compute_cmyk_diffs(a, b, red_boxes) | |
        # Create visualizations with the default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)
        # Always show the CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        # Create the pixel-difference overlay
        overlay = make_red_overlay(a, b)
        # Create the status message
        status = f"""
📊 **Analysis Complete!**
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} × {a.height} pixels

**Legend:**
- 🔴 Red boxes: Visual differences
- 🔵 Cyan boxes: Spelling errors
- 🟢 Green boxes: Barcodes/QR codes
"""
        # Prepare barcode data for the tables
        codes_a = [[c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                    c.get('width', 0), c.get('height', 0), c.get('valid', False)] for c in info_a]
        codes_b = [[c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                    c.get('width', 0), c.get('height', 0), c.get('valid', False)] for c in info_b]
        return overlay, a_disp, b_disp, status, codes_a, codes_b
    except Exception as e:
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
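# Hedged smoke-test sketch: compare_pdfs only reads `.name` from its file
# arguments (as Gradio's File component provides), so a plain namespace with
# hypothetical paths is enough to exercise it outside the UI:
#
#   from types import SimpleNamespace
#   overlay, a_disp, b_disp, status, codes_a, codes_b = compare_pdfs(
#       SimpleNamespace(name="proof_a.pdf"),
#       SimpleNamespace(name="proof_b.pdf"),
#   )
#   print(status)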
# -------------------- Gradio App --------------------
def create_demo():
    # Custom theme: light-blue page background, using supported parameters only
    custom_theme = gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="blue",
        font=gr.themes.GoogleFont("Inter"),
    ).set(
        body_background_fill="#99cfe9",        # light blue background
        body_background_fill_dark="#99cfe9",
        block_background_fill="#000000",       # black blocks for contrast
        block_background_fill_dark="#000000",
        border_color_primary="#333333",        # dark borders
        border_color_primary_dark="#333333",
    )
    with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
        gr.Markdown("""
        # 🔍 Advanced PDF Comparison Tool
        Upload two PDF files to get comprehensive analysis including:
        - **Multi-page PDF support** (up to 15 pages per document)
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)
        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")
        status_md = gr.Markdown("")
        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")
        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")
        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False,
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False,
            )
        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df],
        )
        gr.Markdown("""
        ### 📋 Instructions
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. Review the comprehensive analysis results

        ### 🎨 Color Legend
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **📊 Side panel:** CMYK color analysis for print workflows
        """)
    return demo
# Imports used by the barcode debug helpers below. ImageOps is not part of
# the PIL import at the top of the file, and pyzbar is optional, so guard it
# the same way the other optional backends are guarded.
from PIL import ImageOps
try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol
except Exception:
    zbar_decode = None
    ZBarSymbol = None

def _binarize(pil_img: Image.Image) -> Image.Image:
    """Create a binarized (black/white) version of an image for better barcode detection."""
    g = ImageOps.grayscale(pil_img)
    g = ImageOps.autocontrast(g)
    return g.point(lambda x: 255 if x > 140 else 0, mode='1').convert('L')
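# The fixed cutoff of 140 works for clean proofs; for scans with uneven
# lighting an Otsu threshold is a common alternative. A minimal sketch,
# assuming scikit-image (already imported elsewhere in this file) is
# available; not wired into the pipeline:
def _binarize_otsu(pil_img: Image.Image) -> Image.Image:
    from skimage.filters import threshold_otsu
    g = ImageOps.autocontrast(ImageOps.grayscale(pil_img))
    t = threshold_otsu(np.asarray(g))  # data-driven global threshold
    return g.point(lambda x, t=t: 255 if x > t else 0, mode='1').convert('L')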
def _decode_once(img: Image.Image):
    """Single decode attempt over common barcode symbologies (pyzbar path only)."""
    if zbar_decode is None:
        return []  # pyzbar unavailable; ZXing-CPP decoding is handled elsewhere
    syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=syms)
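# Hedged sketch of the ZXing-CPP path mentioned above. Assumes the `zxingcpp`
# package (pip install zxing-cpp), whose read_barcodes() accepts a numpy
# array and returns results with .format/.text/.position attributes; this
# helper is illustrative and not called by the debug scanner:
def _decode_once_zxing(img: Image.Image):
    try:
        import zxingcpp
    except Exception:
        return []
    return zxingcpp.read_barcodes(np.asarray(img.convert("L")))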
def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
    """
    Debug helper: scan a PDF at multiple DPIs and image variants to diagnose
    barcode-detection issues.

    This function:
    - renders pages at 600/900/1200 DPI
    - tries grayscale, binarized, and rotated versions
    - scans embedded images (XObjects)
    - prints what it finds and writes debug PNGs
    - helps identify whether barcodes are too thin or too low-resolution

    Usage:
        debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
    """
    if not HAS_PYMUPDF:
        print("ERROR: Missing PyMuPDF dependency")
        return
    os.makedirs(outdir, exist_ok=True)
    doc = fitz.open(pdf_path)
    for dpi in (600, 900, 1200):
        scale = dpi / 72.0
        mat = fitz.Matrix(scale, scale)
        print(f"\n=== DPI {dpi} ===")
        for p in range(min(len(doc), max_pages)):
            page = doc[p]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            img.save(f"{outdir}/page{p+1}_{dpi}.png")
            # Try different image variants
            variants = [
                ("orig", img),
                ("gray", ImageOps.grayscale(img)),
                ("bin", _binarize(img)),
            ]
            found = []
            for tag, variant in variants:
                hits = _decode_once(variant)
                if hits:
                    found.extend((tag, h.type, h.data) for h in hits)
                else:
                    # Try rotations
                    for angle in (90, 180, 270):
                        hits = _decode_once(variant.rotate(angle, expand=True))
                        if hits:
                            found.extend((f"{tag}_rot{angle}", h.type, h.data) for h in hits)
                            break
            print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")
            # Scan embedded images too
            imgs = page.get_images(full=True)
            for ix, (xref, *_) in enumerate(imgs):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    if ipix.alpha:
                        ipix = fitz.Pixmap(ipix, 0)  # drop the alpha channel
                    pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
                    pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
                    hits = _decode_once(pil) or _decode_once(_binarize(pil))
                    if hits:
                        print(f"  Embedded image {ix+1}: {[(h.type, h.data) for h in hits]}")
                except Exception as e:
                    print("  Embedded image error:", e)
    doc.close()
    print(f"\nDebug images saved to: {outdir}/")
    print("Open the PNGs and zoom in to check bar width. If narrow bars are <2 px at 600 DPI, you need 900-1200 DPI.")
# Dict and Any are used below but missing from the typing import at the top of the file
from typing import Any, Dict

def find_barcodes_in_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    try:
        doc = fitz.open(pdf_path)
        n = min(len(doc), max_pages)
        y_offset = 0
        target_width = int(image_size[0]) if image_size else None
        for page_idx in range(n):
            page = doc[page_idx]
            if target_width:
                scale = max(1.0, float(target_width) / float(page.rect.width))
            else:
                scale = 600.0 / 72.0
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            # 1) embedded XObjects (often crisper than the page raster).
            # Note: decode coordinates here are in the XObject's own pixel
            # space, which may differ from the page raster space.
            for ix, (xref, *_) in enumerate(page.get_images(full=True)):
                try:
                    epix = fitz.Pixmap(doc, xref)
                    epil = _pix_to_pil(epix)
                    for r in _decode_variants(epil):
                        # Skip barcodes inside the excluded bottom 115 mm of the page
                        per_page_box = Box(r["top"], r["left"], r["top"] + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                        effective_dpi = int(round(72.0 * scale))
                        if _is_in_excluded_bottom_area(per_page_box, pil.size[1], excluded_height_mm=115.0, dpi=effective_dpi):
                            continue
                        b = Box(r["top"] + y_offset, r["left"], r["top"] + y_offset + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                        boxes.append(b)
                        infos.append({**r, "valid": _validate(r.get("type", ""), r.get("data", "")), "page": page_idx + 1, "source": f"embed:{ix+1}"})
                except Exception:
                    pass
            # 2) full-page raster
            for r in _decode_variants(pil):
                # Skip barcodes inside the excluded bottom 115 mm of the page
                per_page_box = Box(r["top"], r["left"], r["top"] + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                effective_dpi = int(round(72.0 * scale))
                if _is_in_excluded_bottom_area(per_page_box, pil.size[1], excluded_height_mm=115.0, dpi=effective_dpi):
                    continue
                b = Box(r["top"] + y_offset, r["left"], r["top"] + y_offset + r["height"], r["left"] + r["width"], r["width"] * r["height"])
                boxes.append(b)
                infos.append({**r, "valid": _validate(r.get("type", ""), r.get("data", "")), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
            y_offset += pil.size[1]
        doc.close()
    except Exception:
        return [], []
    return boxes, infos
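# Hedged usage sketch (file name and dimensions hypothetical); boxes come
# back in the combined display-image coordinate space:
#   boxes, infos = find_barcodes_in_pdf("label.pdf", image_size=(4961, 14032))
#   for info in infos:
#       print(info["page"], info["type"], info["data"], info["valid"])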
def find_barcodes_in_image(pil: Image.Image):
    """Fallback when PyMuPDF is unavailable: decode directly from the combined image."""
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    for r in _decode_variants(pil):
        b = Box(r["top"], r["left"], r["top"] + r["height"], r["left"] + r["width"], r["width"] * r["height"])
        boxes.append(b)
        infos.append({**r, "valid": _validate(r.get("type", ""), r.get("data", "")), "page": 1, "source": "image"})
    return boxes, infos
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
    """Detect barcodes from the original PDF and return boxes in the same
    coordinate space as the combined display image.

    If image_size is provided (w, h of the vertically combined display image),
    each page is rendered so its width matches w, then decoded. Box y-coordinates
    are offset by the cumulative height of previous pages so that all boxes map
    correctly into the combined image space.
    """
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    try:
        doc = fitz.open(pdf_path)
        num_pages = min(len(doc), max_pages)
        if num_pages == 0:
            return [], []
        target_width = None
        if image_size:
            target_width = int(image_size[0])
        y_offset = 0
        for page_idx in range(num_pages):
            page = doc[page_idx]
            # Compute the scale so the rendered width matches target_width when provided
            if target_width:
                page_width_pts = float(page.rect.width)  # points (1/72 inch)
                scale = max(1.0, target_width / page_width_pts)
            else:
                scale = 600.0 / 72.0  # ~600 DPI default
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            pw, ph = pil.size
            effective_dpi = 72.0 * scale  # the real DPI of this rendered page
            hits = _decode_variants(pil)
            for r in hits:
                x1 = int(r.get("left", 0))
                y1 = int(r.get("top", 0))
                w = int(r.get("width", 0))
                h = int(r.get("height", 0))
                x2 = x1 + w
                y2 = y1 + h
                # Per-page box (before stacking)
                per_page_box = Box(y1, x1, y2, x2, w * h)
                # Exclude the bottom 115 mm of THIS page using the correct DPI
                if _is_in_excluded_bottom_area(per_page_box, ph, excluded_height_mm=115.0, dpi=int(effective_dpi)):
                    continue
                # Map into the combined image by adding the current page's y-offset
                combined_box = Box(y1 + y_offset, x1, y2 + y_offset, x2, w * h)
                boxes.append(combined_box)
                sym, payload = r.get("type", ""), r.get("data", "")
                infos.append({
                    **r,
                    "valid": _validate(sym, payload),
                    "page": page_idx + 1,
                    "source": f"page@dpi{int(effective_dpi)}",
                })
                # Attach GS1 parsing when the payload contains application identifiers
                gs1 = parse_gs1(payload)
                if gs1:
                    infos[-1]["gs1"] = gs1
            y_offset += ph
        doc.close()
    except Exception:
        return [], []
    return boxes, infos
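# Worked example of the mapping above (numbers hypothetical): if page 1
# rendered 4961 px tall and a hit on page 2 has top=100, the combined-image
# box starts at y = 100 + 4961 = 5061. The bottom-area exclusion converts
# 115 mm to pixels as 115 / 25.4 * dpi, i.e. ~2717 px at 600 DPI, measured
# from the bottom edge of that page.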
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",  # allow external access
        share=True,             # set to True to create a public link
        show_error=True,
    )