Yaz Hobooti committed on
Commit
1baa0bd
·
1 Parent(s): 7de2fa8

Complete setup: Add app.py, update requirements.txt and README.md

Files changed (3)
  1. README.md +34 -5
  2. app.py +1382 -0
  3. requirements.txt +7 -1
README.md CHANGED
@@ -1,13 +1,42 @@
1
  ---
2
  title: ProofCheck
3
- emoji: 🔥
4
- colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.47.2
8
  app_file: app.py
9
  pinned: false
10
- short_description: ProofCheck
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: ProofCheck
3
+ emoji: 🔍
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ tags:
12
+ - document-processing
13
+ - pdf
14
+ - ocr
15
+ - comparator
16
+ task_categories:
17
+ - other
18
+ pretty_name: ProofCheck
19
  ---
20
 
21
+ # 🔍 Advanced PDF Comparison Tool
22
+
23
+ Upload two PDF files to get comprehensive analysis including:
24
+ - **Visual differences** with bounding boxes
25
+ - **OCR and spell checking**
26
+ - **Barcode/QR code detection**
27
+ - **CMYK color analysis**
28
+
29
+ ## Features
30
+ - High-DPI PDF rendering (600 DPI) for improved OCR and barcode recognition
31
+ - Rule-based text and layout comparison
32
+ - Export of comparison results
33
+
34
+ ## Usage
35
+ Run locally:
36
+
37
+ ```bash
38
+ python app.py
39
+ ```
40
+
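+ Poppler, ZBar, and the Tesseract engine are system-level dependencies of pdf2image, pyzbar, and pytesseract; a minimal setup sketch (poppler-utils and libzbar0 come from the notes inside app.py, tesseract-ocr is assumed for pytesseract, Debian/Ubuntu package names):
+
+ ```bash
+ pip install -r requirements.txt
+ sudo apt-get install poppler-utils libzbar0 tesseract-ocr
+ ```
+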
41
+ ## License
42
+ Apache-2.0
app.py ADDED
@@ -0,0 +1,1382 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio PDF Comparison Tool
4
+ Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
5
+ """
6
+
7
+ import os, sys, re, csv, json, io
8
+ from dataclasses import dataclass
9
+ from typing import List, Tuple, Optional, Iterable
10
+ import tempfile
11
+ import unicodedata
12
+
13
+ import numpy as np
14
+ from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
15
+ from pdf2image import convert_from_path
16
+ from skimage.measure import label, regionprops
17
+ from skimage.morphology import dilation, rectangle
18
+ import gradio as gr
19
+
20
+ # Alternative PDF processing
21
+ try:
22
+ import fitz # PyMuPDF
23
+ HAS_PYMUPDF = True
24
+ except Exception:
25
+ fitz = None
26
+ HAS_PYMUPDF = False
27
+
28
+ # Optional features
29
+ try:
30
+ import pytesseract
31
+ HAS_OCR = True
32
+ except Exception:
33
+ pytesseract = None
34
+ HAS_OCR = False
35
+
36
+ try:
37
+ from spellchecker import SpellChecker
38
+ HAS_SPELLCHECK = True
39
+ except Exception:
40
+ SpellChecker = None
41
+ HAS_SPELLCHECK = False
42
+
43
+ try:
44
+ import regex as re
45
+ HAS_REGEX = True
46
+ except Exception:
47
+ import re
48
+ HAS_REGEX = False
49
+
50
+ try:
51
+ from pyzbar.pyzbar import decode as zbar_decode
52
+ HAS_BARCODE = True
53
+ except Exception:
54
+ zbar_decode = None
55
+ HAS_BARCODE = False
56
+
57
+ # -------------------- Core Data --------------------
58
+ @dataclass
59
+ class Box:
60
+ y1: int; x1: int; y2: int; x2: int; area: int
61
+
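+ # Example (sketch, not used by the app): fields are positional, so
+ # Box(10, 20, 50, 80, (80 - 20) * (50 - 10)) means y1=10, x1=20, y2=50, x2=80
+ # with area 2400.
+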
62
+ # ---- spell/tokenization helpers & caches ----
63
+ if HAS_REGEX:
64
+ # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
65
+ _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
66
+ else:
67
+ # Fallback regex for basic ASCII
68
+ _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
69
+
70
+ if HAS_SPELLCHECK:
71
+ # Initialize English spell checker with comprehensive dictionary
72
+ _SPELL_EN = SpellChecker(language="en")
73
+
74
+ # Try to initialize French spell checker with fallback
75
+ _SPELL_FR = None
76
+ try:
77
+ _SPELL_FR = SpellChecker(language="fr")
78
+ except Exception:
79
+ # If French dictionary fails, try alternative approach
80
+ try:
81
+ _SPELL_FR = SpellChecker()
82
+ # Load some basic French words manually if needed
83
+ except Exception:
84
+ _SPELL_FR = None
85
+ print("Warning: French spell checker not available")
86
+ else:
87
+ _SPELL_EN = None
88
+ _SPELL_FR = None
89
+
90
+ _DOMAIN_ALLOWLIST = {
91
+ # Company/Brand names
92
+ "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
93
+ "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
94
+
95
+ # Technical terms
96
+ "CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
97
+ "Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
98
+ "Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",
99
+
100
+ # Common abbreviations
101
+ "Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
102
+ "USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",
103
+
104
+ # French words (common in Canadian context)
105
+ "Québec", "Montréal", "Toronto", "Vancouver", "Ottawa", "Calgary",
106
+ "français", "française", "anglais", "anglaise", "bilingue",
107
+
108
+ # Common business terms
109
+ "Marketing", "Sales", "Customer", "Service", "Quality", "Control",
110
+ "Management", "Administration", "Production", "Manufacturing",
111
+ "Distribution", "Logistics", "Supply", "Chain", "Inventory",
112
+
113
+ # Common words that might be flagged
114
+ "Email", "Website", "Online", "Internet", "Software", "Hardware",
115
+ "Database", "System", "Network", "Server", "Client", "User",
116
+ "Password", "Login", "Logout", "Account", "Profile", "Settings",
117
+ "Configuration", "Installation", "Maintenance", "Support",
118
+
119
+ # Numbers and measurements
120
+ "mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
121
+ "x", "by", "times", "multiply", "divide", "plus", "minus",
122
+
123
+ # British spellings that are correct in context
124
+ "colour", "favour", "honour",
125
+ "behaviour", "neighbour", "centre",
126
+ "theatre", "metre", "litre",
127
+
128
+ # Pharmaceutical terms
129
+ "glycerol", "sativa","tocophersolan", "tocopherol", "tocopheryl", "acetate",
130
+ "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
131
+ "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
132
+ "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
133
+ "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
134
+ "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
135
+ "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
136
+ "copper", "manganese", "selenium", "chromium", "molybdenum",
137
+ "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
138
+ "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
139
+ "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
140
+ "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
141
+ "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
142
+ "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
143
+ "asparagine", "glutamic", "aspartic", "alanine", "glycine",
144
+ "polysorbate", "monostearate", "distearate", "tristearate",
145
+ "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
146
+ "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
147
+ "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
148
+ "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
149
+ "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
150
+ "eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
151
+ "conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
152
+ "hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
153
+ "phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
154
+ "malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
155
+ "palmitate", "stearate", "oleate", "linoleate", "arachidonate"
156
+ }
157
+ _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
158
+
159
+ if _SPELL_EN:
160
+ _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
161
+ if _SPELL_FR:
162
+ _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
163
+
164
+ def _normalize_text(s: str) -> str:
165
+ """Normalize text for better word extraction"""
166
+ if not s:
167
+ return ""
168
+
169
+ # Unicode normalization
170
+ s = unicodedata.normalize("NFC", s)
171
+
172
+ # Fix common apostrophe issues
173
+ s = s.replace("\u2019", "'").replace("\u2018", "'")  # curly quotes -> straight apostrophe
174
+
175
+ # Normalize whitespace - replace multiple spaces with single space
176
+ s = re.sub(r'\s+', ' ', s)
177
+
178
+ # Remove leading/trailing whitespace
179
+ s = s.strip()
180
+
181
+ return s
182
+
183
+ def _extract_tokens(raw: str):
184
+ """Extract word tokens with improved filtering"""
185
+ s = _normalize_text(raw or "")
186
+ tokens = _WORD_RE.findall(s)
187
+
188
+ # Filter out tokens that are too short or don't look like words
189
+ filtered_tokens = []
190
+ for token in tokens:
191
+ if len(token) >= 2 and _is_likely_word(token):
192
+ filtered_tokens.append(token)
193
+
194
+ return filtered_tokens
195
+
196
+ def _looks_like_acronym(tok: str) -> bool:
197
+ """Check if token looks like a valid acronym"""
198
+ return tok.isupper() and 2 <= len(tok) <= 6
199
+
200
+ def _has_digits(tok: str) -> bool:
201
+ """Check if token contains digits"""
202
+ return any(ch.isdigit() for ch in tok)
203
+
204
+ def _is_mostly_numbers(tok: str) -> bool:
205
+ """Check if token is mostly numbers (should be ignored)"""
206
+ if not tok:
207
+ return False
208
+
209
+ # Count digits and letters
210
+ digit_count = sum(1 for ch in tok if ch.isdigit())
211
+ letter_count = sum(1 for ch in tok if ch.isalpha())
212
+ total_chars = len(tok)
213
+
214
+ # If more than 70% digits, consider it mostly numbers
215
+ if digit_count / total_chars > 0.7:
216
+ return True
217
+
218
+ # If it's a pure number (all digits), ignore it
219
+ if digit_count == total_chars:
220
+ return True
221
+
222
+ # If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
223
+ if total_chars >= 2 and digit_count >= 1:
224
+ suffix = tok[-2:].lower()
225
+ if suffix in ['st', 'nd', 'rd', 'th']:
226
+ return True
227
+
228
+ # If it's a decimal number (contains digits and decimal point)
229
+ if '.' in tok and digit_count > 0:
230
+ return True
231
+
232
+ # If it's a percentage (ends with %)
233
+ if tok.endswith('%') and digit_count > 0:
234
+ return True
235
+
236
+ return False
237
+
238
+ def _is_likely_word(tok: str) -> bool:
239
+ """Check if token looks like a real word (not random characters)"""
240
+ if len(tok) < 2:
241
+ return False
242
+
243
+ # Filter out tokens that are mostly non-letter characters
244
+ letter_count = sum(1 for c in tok if c.isalpha())
245
+ if letter_count < len(tok) * 0.6: # At least 60% letters
246
+ return False
247
+
248
+ # Filter out tokens with too many consecutive consonants/vowels
249
+ vowels = set('aeiouAEIOU')
250
+ consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
251
+
252
+ # Check for excessive consonant clusters (like "qwerty" or "zxcvb")
253
+ if len(tok) >= 4:
254
+ consonant_clusters = 0
255
+ vowel_clusters = 0
256
+ for i in range(len(tok) - 2):
257
+ if all(c in consonants for c in tok[i:i+3]):
258
+ consonant_clusters += 1
259
+ if all(c in vowels for c in tok[i:i+3]):
260
+ vowel_clusters += 1
261
+
262
+ # If consonant clusters make up a large share of the token, it is likely not a word
263
+ if consonant_clusters > len(tok) * 0.3:
264
+ return False
265
+
266
+ # Filter out tokens that look like random keyboard patterns
267
+ keyboard_patterns = [
268
+ 'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
269
+ 'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
270
+ '123456', '234567', '345678', '456789', '567890'
271
+ ]
272
+
273
+ tok_lower = tok.lower()
274
+ for pattern in keyboard_patterns:
275
+ if pattern in tok_lower or tok_lower in pattern:
276
+ return False
277
+
278
+ return True
279
+
280
+ def _is_known_word(tok: str) -> bool:
281
+ """Check if token is a known word with comprehensive filtering"""
282
+ t = tok.lower()
283
+
284
+ # First check if it looks like a real word
285
+ if not _is_likely_word(tok):
286
+ return True # Don't flag non-words as misspellings
287
+
288
+ # Ignore numbers and mostly numeric tokens
289
+ if _is_mostly_numbers(tok):
290
+ return True # Don't flag numbers as misspellings
291
+
292
+ # Check domain allowlist, acronyms, and words with digits
293
+ if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
294
+ return True
295
+
296
+ # Check hyphenated words - if any part is known, consider the whole word known
297
+ if '-' in tok:
298
+ parts = tok.split('-')
299
+ if all(_is_known_word(part) for part in parts):
300
+ return True
301
+
302
+ # Check against English spell checker
303
+ if _SPELL_EN:
304
+ try:
305
+ # Check if word is known in English dictionary
306
+ if not _SPELL_EN.unknown([t]):
307
+ return True
308
+ except Exception:
309
+ pass
310
+
311
+ # Check against French spell checker
312
+ if _SPELL_FR:
313
+ try:
314
+ # Check if word is known in French dictionary
315
+ if not _SPELL_FR.unknown([t]):
316
+ return True
317
+ except Exception:
318
+ pass
319
+
320
+ # Additional checks for common patterns
321
+ # Check for common suffixes/prefixes that might not be in dictionaries
322
+ common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
323
+ common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']
324
+
325
+ # Check if word with common suffix/prefix is known
326
+ for suffix in common_suffixes:
327
+ if t.endswith(suffix) and len(t) > len(suffix) + 2:
328
+ base_word = t[:-len(suffix)]
329
+ if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
330
+ return True
331
+
332
+ for prefix in common_prefixes:
333
+ if t.startswith(prefix) and len(t) > len(prefix) + 2:
334
+ base_word = t[len(prefix):]
335
+ if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
336
+ return True
337
+
338
+ # Check for plural forms (simple 's' ending)
339
+ if t.endswith('s') and len(t) > 3:
340
+ singular = t[:-1]
341
+ if _SPELL_EN and not _SPELL_EN.unknown([singular]):
342
+ return True
343
+
344
+ return False
345
+
346
+ # (optional) keep a compatibility shim so any other code calling normalize_token() won't break
347
+ def normalize_token(token: str) -> str:
348
+ toks = _extract_tokens(token)
349
+ return (toks[0].lower() if toks else "")
350
+
351
+ # -------------------- Helpers ----------------------
352
+ def _is_pdf(path: str) -> bool:
353
+ return os.path.splitext(path.lower())[1] == ".pdf"
354
+
355
+ def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
356
+ """
357
+ Check if a box is in the excluded bottom area (115mm from bottom).
358
+ Converts mm to pixels using DPI.
359
+ """
360
+ # Convert mm to pixels: 1 inch = 25.4mm, so 1mm = dpi/25.4 pixels
361
+ excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
362
+
363
+ # Calculate the top boundary of the excluded area
364
+ excluded_top = image_height - excluded_height_pixels
365
+
366
+ # Check if the box intersects with the excluded area
367
+ return box.y1 >= excluded_top
368
+
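+ # Worked example for the conversion above (a sketch, not called anywhere):
+ # at the default 400 DPI, 115 mm is int(115 * 400 / 25.4) = 1811 px, so on a
+ # 5000 px tall page any box whose top edge y1 >= 5000 - 1811 = 3189 is excluded.
+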
369
+ def _contains_validation_text(text: str) -> bool:
370
+ """Check if text contains the validation text '50 Carroll'"""
371
+ return "50 Carroll" in text
372
+
373
+ def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
374
+ """Load PDF pages as images with fallback options"""
375
+ if not _is_pdf(path):
376
+ return [Image.open(path).convert("RGB")]
377
+
378
+ # Try pdf2image first
379
+ poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
380
+
381
+ for poppler_path in poppler_paths:
382
+ try:
383
+ if poppler_path:
384
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
385
+ else:
386
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
387
+
388
+ if imgs:
389
+ return [img.convert("RGB") for img in imgs]
390
+ except Exception:
391
+ if poppler_path is None: # All pdf2image attempts failed
392
+ break
393
+ continue # Try next path
394
+
395
+ # Fallback to PyMuPDF
396
+ if HAS_PYMUPDF:
397
+ try:
398
+ doc = fitz.open(path)
399
+ pages = []
400
+ for page_num in range(min(len(doc), max_pages)):
401
+ page = doc[page_num]
402
+ mat = fitz.Matrix(dpi/72, dpi/72)
403
+ pix = page.get_pixmap(matrix=mat)
404
+ img_data = pix.tobytes("ppm")
405
+ img = Image.open(io.BytesIO(img_data))
406
+ pages.append(img.convert("RGB"))
407
+ doc.close()
408
+ return pages
409
+ except Exception as e:
410
+ raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
411
+
412
+ raise ValueError("Failed to convert PDF to image. No working method available.")
413
+
414
+ def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
415
+ """Combine multiple pages into a single vertical image"""
416
+ if not pages:
417
+ raise ValueError("No pages to combine")
418
+ if len(pages) == 1:
419
+ return pages[0]
420
+
421
+ # Find the maximum width
422
+ max_width = max(page.width for page in pages)
423
+
424
+ # Calculate total height
425
+ total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
426
+
427
+ # Create combined image
428
+ combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
429
+
430
+ y_offset = 0
431
+ for page in pages:
432
+ # Center the page horizontally if it's narrower than max_width
433
+ x_offset = (max_width - page.width) // 2
434
+ combined.paste(page, (x_offset, y_offset))
435
+ y_offset += page.height + spacing
436
+
437
+ return combined
438
+
439
+ def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
440
+ if a.size == b.size:
441
+ return a, b
442
+ w, h = min(a.width, b.width), min(a.height, b.height)
443
+ return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))
444
+
445
+ def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
446
+ return ImageChops.difference(a, b)
447
+
448
+ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
449
+ arr = np.asarray(diff_img).astype(np.uint16)
450
+ gray = arr.max(axis=2).astype(np.uint8)
451
+ mask = (gray >= threshold).astype(np.uint8)
452
+ mask = dilation(mask, rectangle(3, 3))
453
+ labeled = label(mask, connectivity=2)
454
+ out: List[Box] = []
455
+ img_height = diff_img.height
456
+
457
+ for p in regionprops(labeled):
458
+ if p.area < min_area:
459
+ continue
460
+ minr, minc, maxr, maxc = p.bbox
461
+ box = Box(minr, minc, maxr, maxc, int(p.area))
462
+
463
+ # Skip boxes in the excluded bottom area
464
+ if _is_in_excluded_bottom_area(box, img_height):
465
+ continue
466
+
467
+ out.append(box)
468
+ return out
469
+
470
+ def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
471
+ width: int = 3) -> Image.Image:
472
+ out = img.copy(); d = ImageDraw.Draw(out)
473
+ # red (diff)
474
+ for b in red_boxes:
475
+ for w in range(width):
476
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(255,0,0))
477
+ # cyan (misspellings)
478
+ for b in cyan_boxes:
479
+ for w in range(width):
480
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,255))
481
+ # green (barcodes)
482
+ if green_boxes:
483
+ for b in green_boxes:
484
+ for w in range(width):
485
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,0))
486
+ return out
487
+
488
+ def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
489
+ A = np.asarray(a).copy(); B = np.asarray(b)
490
+ mask = np.any(A != B, axis=2)
491
+ A[mask] = [255, 0, 0]
492
+ return Image.fromarray(A)
493
+
494
+ # -------------------- OCR + Spellcheck -------------
495
+ # Reuses the guarded imports (pytesseract, SpellChecker, regex) and the
+ # tokenization/spell-check helpers (_WORD_RE, _normalize_text, _extract_tokens,
+ # _is_known_word, normalize_token) defined above.
+
544
+ def _get_available_tesseract_langs():
545
+ """Get available Tesseract languages"""
546
+ try:
547
+ langs = pytesseract.get_languages()
548
+ if 'eng' in langs and 'fra' in langs:
549
+ return "eng+fra"
550
+ elif 'eng' in langs:
551
+ return "eng"
552
+ elif langs:
553
+ return langs[0]
554
+ else:
555
+ return "eng"
556
+ except Exception:
557
+ return "eng"
558
+
559
+ def prepare_for_ocr(img: Image.Image) -> Image.Image:
560
+ """Prepare image for better OCR results"""
561
+ from PIL import ImageOps, ImageFilter
562
+ g = img.convert("L")
563
+ g = ImageOps.autocontrast(g)
564
+ g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
565
+ return g
566
+
567
+ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
568
+ """Extract text directly from PDF using PyMuPDF"""
569
+ if not HAS_PYMUPDF:
570
+ return []
571
+
572
+ try:
573
+ doc = fitz.open(path)
574
+ texts = []
575
+ for page_num in range(min(len(doc), max_pages)):
576
+ page = doc[page_num]
577
+ text = page.get_text()
578
+ texts.append(text)
579
+ doc.close()
580
+ return texts
581
+ except Exception:
582
+ return []
583
+
584
+ def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
585
+ """Convert PDF coordinates to image coordinates"""
586
+ pdf_width, pdf_height = pdf_page_size
587
+ img_width, img_height = image_size
588
+
589
+ # Scale factors
590
+ scale_x = img_width / pdf_width
591
+ scale_y = img_height / pdf_height
592
+
593
+ # Convert PDF coordinates to image coordinates
594
+ x1 = int(pdf_bbox[0] * scale_x)
595
+ y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
596
+ x2 = int(pdf_bbox[2] * scale_x)
597
+ y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
598
+
599
+ return x1, y1, x2, y2
600
+
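+ # Example (sketch, assumed page size): a 612x792 pt page rendered to a
+ # 2550x3300 px image scales by 2550/612 = 3300/792 ~ 4.167, so a PDF bbox of
+ # (72, 72, 144, 90) on page 0 maps to roughly (300, 300, 600, 375) in pixels.
+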
601
+ def find_misspell_boxes_from_text(
602
+ pdf_path: str,
603
+ *,
604
+ extra_allow: Optional[Iterable[str]] = None,
605
+ max_pages: int = 5,
606
+ image_size: Optional[Tuple[int, int]] = None
607
+ ) -> List[Box]:
608
+ """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
609
+ if not (HAS_SPELLCHECK and HAS_PYMUPDF):
610
+ return []
611
+
612
+ # Load extra allowed words
613
+ if extra_allow and _SPELL_EN:
614
+ _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
615
+ if extra_allow and _SPELL_FR:
616
+ _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
617
+
618
+ boxes: List[Box] = []
619
+
620
+ try:
621
+ doc = fitz.open(pdf_path)
622
+
623
+ for page_num in range(min(len(doc), max_pages)):
624
+ page = doc[page_num]
625
+
626
+ # Get text with position information
627
+ text_dict = page.get_text("dict")
628
+
629
+ # Process each block of text
630
+ for block in text_dict.get("blocks", []):
631
+ if "lines" not in block:
632
+ continue
633
+
634
+ for line in block["lines"]:
635
+ for span in line["spans"]:
636
+ text = span.get("text", "").strip()
637
+ if not text:
638
+ continue
639
+
640
+ # Extract tokens and check for misspellings
641
+ tokens = _extract_tokens(text)
642
+ has_misspelling = False
643
+
644
+ for token in tokens:
645
+ if len(token) >= 2 and not _is_known_word(token):
646
+ has_misspelling = True
647
+ break
648
+
649
+ # If this span has misspellings, create a box for it
650
+ if has_misspelling:
651
+ bbox = span["bbox"] # [x0, y0, x1, y1]
652
+
653
+ # Get page dimensions for coordinate conversion
654
+ page_rect = page.rect
655
+ pdf_width = page_rect.width
656
+ pdf_height = page_rect.height
657
+
658
+ # Calculate coordinates
659
+ if image_size:
660
+ img_width, img_height = image_size
661
+ # Convert PDF coordinates to image coordinates
662
+ scale_x = img_width / pdf_width
663
+ scale_y = img_height / pdf_height
664
+ x1 = int(bbox[0] * scale_x)
665
+ y1 = int(bbox[1] * scale_y) + (page_num * img_height)
666
+ x2 = int(bbox[2] * scale_x)
667
+ y2 = int(bbox[3] * scale_y) + (page_num * img_height)
668
+ else:
669
+ x1 = int(bbox[0])
670
+ y1 = int(bbox[1]) + (page_num * 1000)
671
+ x2 = int(bbox[2])
672
+ y2 = int(bbox[3]) + (page_num * 1000)
673
+
674
+ # Create box
675
+ box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))
676
+
677
+ # Skip boxes in excluded bottom area unless they contain validation text
678
+ if image_size:
679
+ img_height = image_size[1]
680
+ if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
681
+ continue
682
+ else:
683
+ # Without a rendered image size the 115 mm bottom-exclusion zone cannot be
+ # mapped to pixel coordinates, so keep the box.
+ pass
685
+
686
+ boxes.append(box)
687
+
688
+ doc.close()
689
+
690
+ except Exception:
691
+ # Fallback to simple text extraction if coordinate mapping fails
692
+ page_texts = extract_pdf_text(pdf_path, max_pages)
693
+ for page_num, text in enumerate(page_texts):
694
+ if not text.strip():
695
+ continue
696
+
697
+ tokens = _extract_tokens(text)
698
+ misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]
699
+
700
+ if misspelled_words:
701
+ # Create a placeholder box for the page
702
+ boxes.append(Box(
703
+ y1=page_num * 1000,
704
+ x1=0,
705
+ y2=(page_num + 1) * 1000,
706
+ x2=800,
707
+ area=800 * 1000
708
+ ))
709
+
710
+ return boxes
711
+
712
+ def find_misspell_boxes(
713
+ img: Image.Image,
714
+ *,
715
+ min_conf: int = 60,
716
+ lang: Optional[str] = None,
717
+ extra_allow: Optional[Iterable[str]] = None,
718
+ dpi: int = 300,
719
+ psm: int = 6,
720
+ oem: int = 3
721
+ ) -> List[Box]:
722
+ """Legacy OCR-based spell checking (kept for fallback)"""
723
+ if not (HAS_OCR and HAS_SPELLCHECK):
724
+ return []
725
+
726
+ # Auto-detect language if not provided
727
+ if lang is None:
728
+ try:
729
+ avail = set(pytesseract.get_languages(config="") or [])
730
+ except Exception:
731
+ avail = {"eng"}
732
+ lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
733
+
734
+ # OPTIONAL: light upscale if the image is small (heuristic)
735
+ # target width ~ 2500-3000 px for letter-sized pages
736
+ if img.width < 1600:
737
+ scale = 2
738
+ img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
739
+
740
+ # Prepare image for better OCR
741
+ img = prepare_for_ocr(img)
742
+
743
+ try:
744
+ if extra_allow and _SPELL_EN:
745
+ _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
746
+ if extra_allow and _SPELL_FR:
747
+ _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
748
+
749
+ # Build a config that sets an explicit DPI and keeps spaces
750
+ config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
751
+
752
+ data = pytesseract.image_to_data(
753
+ img,
754
+ lang=lang,
755
+ config=config,
756
+ output_type=pytesseract.Output.DICT,
757
+ )
758
+ except Exception:
759
+ return []
760
+
761
+ n = len(data.get("text", [])) or 0
762
+ boxes: List[Box] = []
763
+
764
+ for i in range(n):
765
+ raw = data["text"][i]
766
+ if not raw:
767
+ continue
768
+
769
+ # confidence filter
770
+ conf_str = data.get("conf", ["-1"])[i]
771
+ try:
772
+ conf = int(float(conf_str))
773
+ except Exception:
774
+ conf = -1
775
+ if conf < min_conf:
776
+ continue
777
+
778
+ tokens = _extract_tokens(raw)
779
+ if not tokens:
780
+ continue
781
+
782
+ # flag the box if ANY token in it looks misspelled
783
+ if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
784
+ continue
785
+
786
+ left = data.get("left", [0])[i]
787
+ top = data.get("top", [0])[i]
788
+ width = data.get("width", [0])[i]
789
+ height = data.get("height",[0])[i]
790
+ if width <= 0 or height <= 0:
791
+ continue
792
+
793
+ # NOTE: adjust to match your Box constructor if needed
794
+ b = Box(top, left, top + height, left + width, width * height)
795
+ # Exclude bottom 115mm unless the text contains the validation phrase
796
+ if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
797
+ continue
798
+ boxes.append(b)
799
+
800
+ return boxes
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+ # deps: pip install zxing-cpp pyzbar pylibdmtx PyMuPDF pillow opencv-python-headless regex
810
+ # system: macOS -> brew install zbar poppler ; Ubuntu -> sudo apt-get install libzbar0 poppler-utils
811
+
812
+ import io, regex as re
813
+ from typing import List, Tuple, Dict, Any
814
+ from PIL import Image, ImageOps
815
+ import numpy as np
816
+
817
+ import fitz # PyMuPDF
818
+
819
+ # Optional backends
820
+ try:
821
+ import zxingcpp; HAS_ZXING=True
822
+ except Exception: HAS_ZXING=False
823
+ try:
824
+ from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol; HAS_ZBAR=True
825
+ except Exception: HAS_ZBAR=False; ZBarSymbol=None
826
+ try:
827
+ from pylibdmtx.pylibdmtx import decode as dmtx_decode; HAS_DMTX=True
828
+ except Exception: HAS_DMTX=False
829
+ try:
830
+ import cv2; HAS_CV2=True
831
+ except Exception: HAS_CV2=False
832
+
833
+ # your Box(y1,x1,y2,x2,area) assumed to exist
834
+
835
+ def _binarize(img: Image.Image) -> Image.Image:
836
+ g = ImageOps.grayscale(img)
837
+ g = ImageOps.autocontrast(g)
838
+ return g.point(lambda x: 255 if x > 140 else 0, mode="1").convert("L")
839
+
840
+ def _ean_checksum_ok(d: str) -> bool:
841
+ if not d.isdigit(): return False
842
+ n=len(d); nums=list(map(int,d))
843
+ if n==8:
844
+ return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(7))%10))%10==nums[7]
845
+ if n==12:
846
+ return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(11))%10))%10==nums[11]
847
+ if n==13:
848
+ return (10 - (sum(nums[i]*(1 if i%2==0 else 3) for i in range(12))%10))%10==nums[12]
849
+ return True
850
+
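+ # Example (sketch): _ean_checksum_ok("4006381333931") is True -- the weighted
+ # sum of the first 12 digits is 89, and (10 - 89 % 10) % 10 = 1 matches the
+ # final check digit.
+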
851
+ def _normalize_upc_ean(sym: str, text: str):
852
+ digits = re.sub(r"\D","",text or "")
853
+ s = (sym or "").upper()
854
+ if s in ("EAN13","EAN-13") and len(digits)==13 and digits.startswith("0"):
855
+ return "UPCA", digits[1:]
856
+ return s, (digits if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else text or "")
857
+
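+ # Example (sketch): _normalize_upc_ean("EAN13", "0012345678905") drops the
+ # leading zero and returns ("UPCA", "012345678905"); other symbologies pass
+ # their payload through unchanged.
+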
858
+ def _validate(sym: str, payload: str) -> bool:
859
+ s, norm = _normalize_upc_ean(sym, payload)
860
+ return _ean_checksum_ok(norm) if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else bool(payload)
861
+
862
+ def _decode_zxing(pil: Image.Image) -> List[Dict[str,Any]]:
863
+ if not HAS_ZXING: return []
864
+ arr = np.asarray(pil.convert("L"))
865
+ out=[]
866
+ for r in zxingcpp.read_barcodes(arr): # try_harder is default True in recent builds; otherwise supply options
867
+ # zxingcpp.Position may be iterable (sequence of points) or an object with corner attributes
868
+ x1=y1=x2=y2=w=h=0
869
+ pos = getattr(r, "position", None)
870
+ pts: List[Any] = []
871
+ if pos is not None:
872
+ try:
873
+ pts = list(pos) # works if iterable
874
+ except TypeError:
875
+ # Fall back to known corner attribute names across versions
876
+ corner_names = (
877
+ "top_left", "topLeft",
878
+ "top_right", "topRight",
879
+ "bottom_left", "bottomLeft",
880
+ "bottom_right", "bottomRight",
881
+ "point1", "point2", "point3", "point4",
882
+ )
883
+ seen=set()
884
+ for name in corner_names:
885
+ if hasattr(pos, name):
886
+ p = getattr(pos, name)
887
+ # avoid duplicates
888
+ if id(p) not in seen and hasattr(p, "x") and hasattr(p, "y"):
889
+ pts.append(p)
890
+ seen.add(id(p))
891
+ if pts:
892
+ xs=[int(getattr(p, "x", 0)) for p in pts]
893
+ ys=[int(getattr(p, "y", 0)) for p in pts]
894
+ x1,x2=min(xs),max(xs); y1,y2=min(ys),max(ys)
895
+ w,h=x2-x1,y2-y1
896
+ out.append({
897
+ "type": str(r.format),
898
+ "data": r.text or "",
899
+ "left": x1,
900
+ "top": y1,
901
+ "width": w,
902
+ "height": h,
903
+ })
904
+ return out
905
+
906
+ def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
907
+ if not HAS_ZBAR: return []
908
+ syms=[ZBarSymbol.QRCODE,ZBarSymbol.EAN13,ZBarSymbol.EAN8,ZBarSymbol.UPCA,ZBarSymbol.CODE128] if ZBarSymbol else None
909
+ res=zbar_decode(pil, symbols=syms) if syms else zbar_decode(pil)
910
+ return [{"type": d.type, "data": (d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)),
911
+ "left": d.rect.left, "top": d.rect.top, "width": d.rect.width, "height": d.rect.height} for d in res]
912
+
913
+ def _decode_dmtx(pil: Image.Image) -> List[Dict[str,Any]]:
914
+ if not HAS_DMTX: return []
915
+ try:
916
+ res=dmtx_decode(ImageOps.grayscale(pil))
917
+ return [{"type":"DATAMATRIX","data": r.data.decode("utf-8","ignore"),
918
+ "left": r.rect.left, "top": r.rect.top, "width": r.rect.width, "height": r.rect.height} for r in res]
919
+ except Exception:
920
+ return []
921
+
922
+ def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str,Any]]:
923
+ if not HAS_CV2: return []
924
+ try:
925
+ det=cv2.QRCodeDetector()
926
+ g=np.asarray(pil.convert("L"))
927
+ val, pts, _ = det.detectAndDecode(g)
928
+ if val:
929
+ if pts is not None and len(pts)>=1:
930
+ pts=pts.reshape(-1,2); xs,ys=pts[:,0],pts[:,1]
931
+ x1,x2=int(xs.min()),int(xs.max()); y1,y2=int(ys.min()),int(ys.max())
932
+ w,h=x2-x1,y2-y1
933
+ else:
934
+ x1=y1=w=h=0
935
+ return [{"type":"QRCODE","data":val,"left":x1,"top":y1,"width":w,"height":h}]
936
+ except Exception:
937
+ pass
938
+ return []
939
+
940
+ def _decode_variants(pil: Image.Image) -> List[Dict[str,Any]]:
941
+ variants=[pil, ImageOps.grayscale(pil), _binarize(pil)]
942
+ # upsample small images with NEAREST to keep bars crisp
943
+ w,h=pil.size
944
+ if max(w,h)<1600:
945
+ up=pil.resize((w*2,h*2), resample=Image.NEAREST)
946
+ variants += [up, _binarize(up)]
947
+ for v in variants:
948
+ # ZXing first (broad coverage), then ZBar, then DMTX, then cv2 QR
949
+ res = _decode_zxing(v)
950
+ if res: return res
951
+ res = _decode_zbar(v)
952
+ if res: return res
953
+ res = _decode_dmtx(v)
954
+ if res: return res
955
+ res = _decode_cv2_qr(v)
956
+ if res: return res
957
+ # try rotations
958
+ for angle in (90,180,270):
959
+ r=v.rotate(angle, expand=True)
960
+ res = _decode_zxing(r) or _decode_zbar(r) or _decode_dmtx(r) or _decode_cv2_qr(r)
961
+ if res: return res
962
+ return []
963
+
964
+ def _pix_to_pil(pix) -> Image.Image:
965
+ # convert PyMuPDF Pixmap to grayscale PIL without alpha (avoids blur)
966
+ if pix.alpha: pix = fitz.Pixmap(pix, 0)
967
+ try:
968
+ pix = fitz.Pixmap(fitz.csGRAY, pix)
969
+ except Exception:
970
+ pass
971
+ return Image.open(io.BytesIO(pix.tobytes("png")))
972
+
973
+ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
974
+ """Return (boxes, infos) from both rendered pages and embedded images."""
975
+ boxes=[]; infos=[]
976
+ doc=fitz.open(pdf_path)
977
+ n=min(len(doc), max_pages)
978
+ for page_idx in range(n):
979
+ page=doc[page_idx]
980
+
981
+ # A) Embedded images (often crisp)
982
+ for ix,(xref,*_) in enumerate(page.get_images(full=True)):
983
+ try:
984
+ pix=fitz.Pixmap(doc, xref)
985
+ pil=_pix_to_pil(pix)
986
+ hits=_decode_variants(pil)
987
+ for r in hits:
988
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
989
+ # Exclude barcodes in the bottom 115mm of the page image
990
+ if _is_in_excluded_bottom_area(b, pil.height):
991
+ continue
992
+ boxes.append(b)
993
+ sym, payload = r["type"], r["data"]
994
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
995
+ except Exception:
996
+ pass
997
+
998
+ # B) Render page raster at high DPI (grayscale)
999
+ for dpi in dpi_list:
1000
+ scale=dpi/72.0
1001
+ try:
1002
+ pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), colorspace=fitz.csGRAY, alpha=False)
1003
+ except TypeError:
1004
+ pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), alpha=False)
1005
+ pil=_pix_to_pil(pix)
1006
+ hits=_decode_variants(pil)
1007
+ for r in hits:
1008
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
1009
+ if _is_in_excluded_bottom_area(b, pil.height):
1010
+ continue
1011
+ boxes.append(b)
1012
+ sym, payload = r["type"], r["data"]
1013
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
1014
+ if any(i["page"]==page_idx+1 for i in infos):
1015
+ break # found something for this page -> next page
1016
+ doc.close()
1017
+ return boxes, infos
1018
+
1019
+
1020
+
1021
+
1022
+ # -------------------- CMYK Panel -------------------
1023
+ def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
1024
+ return np.asarray(img.convert('CMYK')).astype(np.float32) # 0..255
1025
+
1026
+ def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
1027
+ y1,y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
1028
+ x1,x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
1029
+ if y2<=y1 or x2<=x1:
1030
+ return (0.0,0.0,0.0,0.0)
1031
+ region = cmyk_arr[y1:y2, x1:x2, :]
1032
+ mean_vals = region.reshape(-1, 4).mean(axis=0)
1033
+ return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)
1034
+
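+ # Example (sketch): average channel values of (128, 64, 0, 255) on the 0-255
+ # scale are reported as roughly (50.2, 25.1, 0.0, 100.0) percent coverage.
+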
1035
+ def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
1036
+ a_cmyk = rgb_to_cmyk_array(a_img)
1037
+ b_cmyk = rgb_to_cmyk_array(b_img)
1038
+ entries = []
1039
+ for i, bx in enumerate(red_boxes):
1040
+ a_vals = avg_cmyk_in_box(a_cmyk, bx)
1041
+ b_vals = avg_cmyk_in_box(b_cmyk, bx)
1042
+ delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
1043
+ entries.append({'idx': i+1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
1044
+ return entries
1045
+
1046
+ def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
1047
+ w,h = base.size
1048
+ panel = Image.new('RGB', (panel_width, h), (245,245,245))
1049
+ out = Image.new('RGB', (w+panel_width, h), (255,255,255))
1050
+ out.paste(base, (0,0)); out.paste(panel, (w,0))
1051
+ d = ImageDraw.Draw(out)
1052
+ x0 = w + 8; y = 8
1053
+ d.text((x0, y), title, fill=(0,0,0)); y += 18
1054
+ if not entries:
1055
+ d.text((x0, y), 'No differing regions', fill=(80,80,80))
1056
+ return out
1057
+ for e in entries:
1058
+ idx = e['idx']; aC,aM,aY,aK = e['A']; bC,bM,bY,bK = e['B']; dC,dM,dY,dK = e['Delta']
1059
+ d.text((x0, y), f"#{idx}", fill=(0,0,0)); y += 14
1060
+ d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0,0,0)); y += 14
1061
+ d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0,0,0)); y += 14
1062
+ d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120,0,0)); y += 18
1063
+ if y > h - 40: break
1064
+ return out
1065
+
1066
+ # -------------------- Gradio Interface -----------------
1067
+ def compare_pdfs(file_a, file_b):
1068
+ """Main comparison function for Gradio interface"""
1069
+ try:
1070
+ if file_a is None or file_b is None:
1071
+ return None, None, None, "❌ Please upload both PDF files to compare", [], []
1072
+
1073
+ # Load images with multiple pages support
1074
+ pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
1075
+ pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
1076
+
1077
+ # Combine pages into single images for comparison
1078
+ a = combine_pages_vertically(pages_a)
1079
+ b = combine_pages_vertically(pages_b)
1080
+
1081
+ # Match sizes
1082
+ a, b = match_sizes(a, b)
1083
+
1084
+ # Find differences with default settings
1085
+ diff = difference_map(a, b)
1086
+ red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)
1087
+
1088
+ # Run all analysis features with defaults
1089
+ # Use text-based spell checking instead of OCR for better accuracy
1090
+ # Pass image dimensions for proper coordinate mapping
1091
+ image_size = (a.width, a.height)
1092
+ misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
1093
+ misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
1094
+
1095
+ # Debug: Print spell check results
1096
+ print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
1097
+
1098
+ if HAS_BARCODE:
1099
+ # Use PDF-based barcode detection instead of rasterized image
1100
+ bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else ([], [])
1101
+ bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else ([], [])
1102
+
1103
+ # Debug: Print barcode detection results
1104
+ print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
1105
+ else:
1106
+ bar_a, info_a = [], []
1107
+ bar_b, info_b = [], []
1108
+
1109
+ # Always enable CMYK analysis
1110
+ cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
1111
+
1112
+ # Create visualizations with default box width
1113
+ a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
1114
+ b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)
1115
+
1116
+ # Always show CMYK panel
1117
+ a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
1118
+ b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
1119
+
1120
+ # Create pixel difference overlay
1121
+ overlay = make_red_overlay(a, b)
1122
+
1123
+ # Create status message
1124
+ status = f"""
1125
+ 📊 **Analysis Complete!**
1126
+ - **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
1127
+ - **Difference regions found:** {len(red_boxes)}
1128
+ - **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
1129
+ - **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
1130
+ - **Combined image dimensions:** {a.width} × {a.height} pixels
1131
+
1132
+ **Legend:**
1133
+ - 🔴 Red boxes: Visual differences
1134
+ - 🔵 Cyan boxes: Spelling errors
1135
+ - 🟢 Green boxes: Barcodes/QR codes
1136
+ """
1137
+
1138
+ # Prepare barcode data for tables
1139
+ codes_a = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
1140
+ c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_a]
1141
+ codes_b = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
1142
+ c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_b]
1143
+
1144
+ return overlay, a_disp, b_disp, status, codes_a, codes_b
1145
+
1146
+ except Exception as e:
1147
+ error_msg = f"❌ **Error:** {str(e)}"
1148
+ return None, None, None, error_msg, [], []
1149
+
1150
+ # -------------------- Gradio App -------------------
1151
+ def create_demo():
1152
+ # Create custom theme with light blue background
1153
+ # Create a simple, working theme with supported parameters only
1154
+ custom_theme = gr.themes.Soft(
1155
+ primary_hue="blue",
1156
+ neutral_hue="blue",
1157
+ font=gr.themes.GoogleFont("Inter"),
1158
+ ).set(
1159
+ body_background_fill="#99cfe9", # Light blue background
1160
+ body_background_fill_dark="#99cfe9",
1161
+ block_background_fill="#000000", # Black blocks for contrast
1162
+ block_background_fill_dark="#000000",
1163
+ border_color_primary="#333333", # Dark borders
1164
+ border_color_primary_dark="#333333",
1165
+ )
1166
+
1167
+ with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
1168
+ gr.Markdown("""
1169
+ # πŸ” Advanced PDF Comparison Tool
1170
+
1171
+ Upload two PDF files to get comprehensive analysis including:
1172
+ - **Multi-page PDF support** (up to 15 pages per document)
1173
+ - **Visual differences** with bounding boxes
1174
+ - **OCR and spell checking**
1175
+ - **Barcode/QR code detection**
1176
+ - **CMYK color analysis**
1177
+ """)
1178
+
1179
+ with gr.Row():
1180
+ with gr.Column():
1181
+ file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
1182
+ file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
1183
+
1184
+ compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")
1185
+
1186
+ status_md = gr.Markdown("")
1187
+
1188
+ with gr.Row():
1189
+ overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")
1190
+
1191
+ with gr.Row():
1192
+ img_a = gr.Image(label="📄 File A with Analysis", type="pil")
1193
+ img_b = gr.Image(label="📄 File B with Analysis", type="pil")
1194
+
1195
+ gr.Markdown("### 📊 Barcode Detection Results")
1196
+ with gr.Row():
1197
+ codes_a_df = gr.Dataframe(
1198
+ headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
1199
+ label="Barcodes in File A",
1200
+ interactive=False
1201
+ )
1202
+ codes_b_df = gr.Dataframe(
1203
+ headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
1204
+ label="Barcodes in File B",
1205
+ interactive=False
1206
+ )
1207
+
1208
+ # Event handlers
1209
+ compare_btn.click(
1210
+ fn=compare_pdfs,
1211
+ inputs=[file_a, file_b],
1212
+ outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
1213
+ )
1214
+
1215
+ gr.Markdown("""
1216
+ ### πŸ“ Instructions:
1217
+ 1. Upload two PDF files
1218
+ 2. Click "Compare PDF Files"
1219
+ 3. View results with comprehensive analysis
1220
+
1221
+ ### 🎨 Color Legend:
1222
+ - **🔴 Red boxes:** Visual differences between files
1223
+ - **🔵 Cyan boxes:** Potential spelling errors (OCR)
1224
+ - **🟢 Green boxes:** Detected barcodes/QR codes
1225
+ - **📊 Side panel:** CMYK color analysis for print workflows
1226
+ """)
1227
+
1228
+ return demo
1229
+
1230
+ def _binarize(pil_img: Image.Image) -> Image.Image:
1231
+ """Create a binarized (black/white) version of the image for better barcode detection"""
1232
+ g = ImageOps.grayscale(pil_img)
1233
+ g = ImageOps.autocontrast(g)
1234
+ return g.point(lambda x: 255 if x > 140 else 0, mode='1').convert('L')
1235
+
1236
+ def _decode_once(img: Image.Image):
1237
+ """Single decode attempt with common barcode symbols"""
1238
+ if not HAS_BARCODE:
1239
+ return []
1240
+ syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128]
1241
+ return zbar_decode(img, symbols=syms)
1242
+
1243
+ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
1244
+ """
1245
+ Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.
1246
+
1247
+ This function:
1248
+ - Renders pages at 600/900/1200 DPI
1249
+ - Tries grayscale, binarized, and rotated versions
1250
+ - Scans embedded images (XObjects)
1251
+ - Prints what it finds and writes debug PNGs
1252
+ - Helps identify if barcodes are too thin/low resolution
1253
+
1254
+ Usage:
1255
+ debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
1256
+ """
1257
+ if not (HAS_BARCODE and HAS_PYMUPDF):
1258
+ print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
1259
+ return
1260
+
1261
+ os.makedirs(outdir, exist_ok=True)
1262
+ doc = fitz.open(pdf_path)
1263
+
1264
+ for dpi in (600, 900, 1200):
1265
+ scale = dpi / 72.0
1266
+ mat = fitz.Matrix(scale, scale)
1267
+ print(f"\n=== DPI {dpi} ===")
1268
+
1269
+ for p in range(min(len(doc), max_pages)):
1270
+ page = doc[p]
1271
+ pix = page.get_pixmap(matrix=mat, alpha=False)
1272
+ img = Image.open(io.BytesIO(pix.tobytes("ppm")))
1273
+ img.save(f"{outdir}/page{p+1}_{dpi}.png")
1274
+
1275
+ # Try different image variants
1276
+ variants = [
1277
+ ("orig", img),
1278
+ ("gray", ImageOps.grayscale(img)),
1279
+ ("bin", _binarize(img)),
1280
+ ]
1281
+ found = []
1282
+
1283
+ for tag, v in variants:
1284
+ r = _decode_once(v)
1285
+ if r:
1286
+ found.extend((tag, rr.type, rr.data) for rr in r)
1287
+ else:
1288
+ # Try rotations
1289
+ for angle in (90, 180, 270):
1290
+ rr = _decode_once(v.rotate(angle, expand=True))
1291
+ if rr:
1292
+ found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
1293
+ break
1294
+
1295
+ print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")
1296
+
1297
+ # Scan embedded images too
1298
+ imgs = page.get_images(full=True)
1299
+ for ix, (xref, *_) in enumerate(imgs):
1300
+ try:
1301
+ ipix = fitz.Pixmap(doc, xref)
1302
+ if ipix.alpha:
1303
+ ipix = fitz.Pixmap(ipix, 0)
1304
+ pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
1305
+ pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
1306
+ rr = _decode_once(pil) or _decode_once(_binarize(pil))
1307
+ if rr:
1308
+ print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
1309
+ except Exception as e:
1310
+ print(" Embedded image error:", e)
1311
+
1312
+ doc.close()
1313
+ print(f"\nDebug images saved to: {outdir}/")
1314
+ print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
1315
+
1316
+ def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
1317
+ """Detect barcodes from the original PDF and return boxes in the same
1318
+ coordinate space as the combined display image.
1319
+
1320
+ If image_size is provided (w,h of the vertically combined display image),
1321
+ each page is rendered so its width matches w, then decoded. Box y-coordinates
1322
+ are offset by the cumulative height of previous pages so that all boxes map
1323
+ into the combined image space correctly.
1324
+ """
1325
+ boxes: List[Box] = []
1326
+ infos: List[Dict[str, Any]] = []
1327
+ try:
1328
+ doc = fitz.open(pdf_path)
1329
+ num_pages = min(len(doc), max_pages)
1330
+ if num_pages == 0:
1331
+ return [], []
1332
+
1333
+ target_width = None
1334
+ if image_size:
1335
+ target_width = int(image_size[0])
1336
+
1337
+ y_offset = 0
1338
+ for page_idx in range(num_pages):
1339
+ page = doc[page_idx]
1340
+ # Compute scale so that rendered width matches target_width when provided
1341
+ if target_width:
1342
+ page_width_pts = float(page.rect.width) # points (72 dpi)
1343
+ scale = max(1.0, target_width / page_width_pts)
1344
+ else:
1345
+ # fallback dpi ~600
1346
+ scale = 600.0 / 72.0
1347
+ try:
1348
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
1349
+ except TypeError:
1350
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
1351
+ pil = _pix_to_pil(pix)
1352
+ pw, ph = pil.size
1353
+ hits = _decode_variants(pil)
1354
+ for r in hits:
1355
+ x1 = int(r.get("left", 0))
1356
+ y1 = int(r.get("top", 0)) + y_offset
1357
+ w = int(r.get("width", 0))
1358
+ h = int(r.get("height", 0))
1359
+ x2 = x1 + w
1360
+ y2 = y1 + h
1361
+ b = Box(y1, x1, y2, x2, w * h)
1362
+ # Exclude bottom 115mm for combined image if we know full height; else per-page
1363
+ if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
1364
+ continue
1365
+ if not image_size and _is_in_excluded_bottom_area(b, ph):
1366
+ continue
1367
+ boxes.append(b)
1368
+ sym, payload = r.get("type", ""), r.get("data", "")
1369
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
1370
+ y_offset += ph
1371
+ doc.close()
1372
+ except Exception:
1373
+ return [], []
1374
+ return boxes, infos
1375
+
1376
+ if __name__ == "__main__":
1377
+ demo = create_demo()
1378
+ demo.launch(
1379
+ server_name="0.0.0.0", # Allow external access
1380
+ share=True, # Set to True to create a public link
1381
+ show_error=True
1382
+ )
requirements.txt CHANGED
@@ -4,4 +4,10 @@ pillow
4
  pdf2image
5
  gradio
6
  PyMuPDF>=1.24
7
- pytesseract
4
  pdf2image
5
  gradio
6
  PyMuPDF>=1.24
7
+ pytesseract
8
+ pyspellchecker
9
+ regex
10
+ pyzbar
11
+ zxing-cpp
12
+ pylibdmtx
13
+ scikit-image