# app.py
import streamlit as st
import fitz  # PyMuPDF
import pdfplumber
import camelot
import json
import tempfile
import os
import re
import base64
from io import BytesIO
from statistics import mean, pstdev

# Optional OCR: pytesseract and PIL are soft dependencies. If importing either
# one raises for any reason, OCR_AVAILABLE is set to False and the rest of the
# module checks that flag before touching `pytesseract` or `Image`.
try:
    import pytesseract
    from PIL import Image
    OCR_AVAILABLE = True
except Exception:
    OCR_AVAILABLE = False

# Heuristic patterns used to recognise contact/footer text.
# EMAIL_RE: local-part@domain.tld with a TLD of at least two letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# PHONE_RE: optional country code followed by three digit groups with optional
# separators. NOTE(review): every separator is optional, so any run of eight
# or more digits also matches — this pattern is deliberately loose.
PHONE_RE = re.compile(r"(\+?\d{1,3})?[\s\-.(]*(\d{2,4})[\s\-.)]*(\d{3,4})[\s\-]*(\d{3,4})")
# URL_RE: "http://", "https://" or "www." followed by non-whitespace characters.
URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
# CIN_RE: the standalone word "CIN" (case-insensitive) and the rest of that line.
CIN_RE = re.compile(r"\bCIN\b.*", flags=re.IGNORECASE)

def image_bytes_to_base64(img_bytes, mime="image/png"):
    """
    Wrap raw image bytes in a base64 ``data:`` URI.
    img_bytes: encoded image data (bytes-like)
    mime: MIME type written into the URI header
    Returns a string of the form "data:<mime>;base64,<payload>".
    """
    payload = str(base64.b64encode(img_bytes), "utf-8")
    return "data:{};base64,{}".format(mime, payload)

def detect_headings(spans):
    """
    Derive font-size cut-offs for headings from the spans of one page.
    spans: list of (text, size, flags, font); only the size is used, and
           sizes that are zero or negative are ignored.
    Returns (section_threshold, subsection_threshold):
      - section threshold is mean + population std-dev, but never below 14
      - subsection threshold is the mean, but never below 11
      - (16, 12) when no usable size is present
    """
    positive_sizes = []
    for _text, size, _flags, _font in spans:
        if size > 0:
            positive_sizes.append(size)

    if len(positive_sizes) == 0:
        return (16, 12)

    average = mean(positive_sizes)
    # A single sample has no spread; skip pstdev in that case.
    spread = 0 if len(positive_sizes) <= 1 else pstdev(positive_sizes)
    return (max(14, average + spread), max(11, average))

def classify_footer_and_signature(lines):
    """
    Classify a block of text lines as signature, footer, or plain paragraph.
    lines: list of strings belonging to one text block.
    Returns (type, combined_text) where type in {"footer","signature","paragraph"}
    and combined_text is the lines joined with newlines and stripped.

    Signature clues (checked first):
      - a closing phrase ("yours sincerely" / "yours faithfully"),
      - a "For <name>" sign-off that starts a line,
      - a job title word (dean, director, manager, ceo, coo).
    Footer clues: an e-mail address, URL, phone-like number or a "CIN" line.
    """
    combined = "\n".join(lines).strip()
    lowered = combined.lower()

    has_closing_phrase = any(
        phrase in lowered for phrase in ("yours sincerely", "yours faithfully")
    )
    # "For ACME Ltd." style sign-offs begin a line. Matching the bare substring
    # "for " anywhere would flag nearly every English paragraph as a signature,
    # so the clue is anchored to the start of a line.
    has_on_behalf_line = re.search(r"^[ \t]*for[ \t]+\S", lowered, flags=re.MULTILINE)
    has_job_title = re.search(r"\b(dean|director|manager|ceo|coo)\b", lowered)
    if has_closing_phrase or has_on_behalf_line or has_job_title:
        return "signature", combined

    if EMAIL_RE.search(combined) or URL_RE.search(combined) or PHONE_RE.search(combined) or CIN_RE.search(combined):
        return "footer", combined
    return "paragraph", combined

def extract_images_from_page(page, embed_images):
    """
    Collect one entry per image referenced by a PyMuPDF page.
    page: PyMuPDF page object (its parent document is used to load pixmaps)
    embed_images: when truthy, each entry also carries the PNG as a data URI
    Returns list of dicts: {"type":"chart","description":...,"image_b64":...}
    ("image_b64" is present only when embed_images is truthy). Images that
    fail to load or convert are reported on stdout and skipped.
    """
    collected = []
    for position, image_info in enumerate(page.get_images(full=True), start=1):
        xref = image_info[0]
        try:
            pixmap = fitz.Pixmap(page.parent, xref)
            colour_channels = pixmap.n - pixmap.alpha
            if colour_channels >= 4:
                # Four or more colour channels (e.g. CMYK): convert to RGB
                # before encoding as PNG.
                pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
            png_bytes = pixmap.tobytes("png")

            entry = {
                "type": "chart",
                "description": f"Image {position} on page {page.number + 1}",
            }
            if embed_images:
                entry["image_b64"] = image_bytes_to_base64(png_bytes, mime="image/png")
            collected.append(entry)

            # Drop the pixmap reference right away so its buffer can be freed.
            del pixmap
        except Exception as e:
            print(f"⚠️ Could not extract image {position} on page {page.number+1}: {e}")
    return collected


def ocr_image_bytes(img_b64):
    """
    Run OCR on a base64-encoded image.
    img_b64: either a data URI ("data:<mime>;base64,<payload>") or a bare
             base64 string without the header.
    Returns the recognised text with surrounding whitespace stripped (may be
    an empty string), or None when OCR support is not available.
    """
    if not OCR_AVAILABLE:
        return None
    # The base64 alphabet contains no comma, so a comma can only be the
    # data-URI header separator. rpartition yields the whole string when no
    # comma is present, so bare base64 input is accepted as well.
    payload = img_b64.rpartition(",")[2]
    img_bytes = base64.b64decode(payload)
    im = Image.open(BytesIO(img_bytes)).convert("RGB")
    text = pytesseract.image_to_string(im)
    return text.strip()

def _collect_text_spans(blocks):
    """
    Flatten PyMuPDF "dict" text blocks into (text, size, flags, font) tuples.
    Blocks without a "lines" key contribute nothing; spans whose text is blank
    after stripping are skipped.
    """
    spans = []
    for block in blocks:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "").strip()
                if text:
                    spans.append(
                        (text, span.get("size", 0), span.get("flags", 0), span.get("font", ""))
                    )
    return spans


def _extract_text_entries(blocks, section_t, subsection_t):
    """
    Convert the text blocks of one page into content entries.
    Each line is tested against the font-size thresholds: large, short (or
    all-caps) lines become "section" entries, medium short lines become
    "sub_section" entries, and the remaining lines of a block are joined and
    classified as paragraph / footer / signature.
    Returns (entries, current_section, current_subsection) where the last two
    are the headings in effect at the end of the page.
    """
    entries = []
    current_section = None
    current_subsection = None

    for block in blocks:
        block_lines = []
        for line in block.get("lines", []):
            line_spans = line.get("spans", [])
            # Only spans with visible text take part in the size decision.
            sizes = [s.get("size", 0) for s in line_spans if s.get("text", "").strip()]
            if not sizes:
                continue
            max_size = max(sizes)
            text_line = " ".join(s.get("text", "").strip() for s in line_spans).strip()
            word_count = len(text_line.split())

            if max_size >= section_t and (text_line.isupper() or word_count <= 6):
                current_section = text_line
                current_subsection = None  # a new section resets the subsection
                entries.append({
                    "type": "section",
                    "section": current_section,
                    "sub_section": None,
                    "text": None
                })
            elif max_size >= subsection_t and word_count <= 8:
                current_subsection = text_line
                entries.append({
                    "type": "sub_section",
                    "section": current_section,
                    "sub_section": current_subsection,
                    "text": None
                })
            else:
                block_lines.append(text_line)

        if block_lines:
            # The classifier returns "signature", "footer" or "paragraph";
            # that label is used directly as the entry type.
            btype, combined = classify_footer_and_signature(block_lines)
            entries.append({
                "type": btype,
                "section": current_section,
                "sub_section": current_subsection,
                "text": combined
            })

    return entries, current_section, current_subsection


def _extract_table_entries(pdf_path, page_number, section, sub_section):
    """
    Read the tables Camelot finds on one page and return them as content entries.
    Table extraction is best-effort: any Camelot failure is reported on stdout
    and the tables gathered up to that point are returned.
    """
    entries = []
    try:
        tables = camelot.read_pdf(pdf_path, pages=str(page_number))
        for idx, table in enumerate(tables, start=1):
            entries.append({
                "type": "table",
                "section": section,
                "sub_section": sub_section,
                "description": f"Table {idx} on page {page_number}",
                "table_data": table.df.values.tolist()
            })
    except Exception as e:
        print(f"⚠️ Could not extract tables on page {page_number}: {e}")
    return entries


def extract_pdf_content(pdf_path, embed_images=False, do_ocr_images=False):
    """
    Main extraction pipeline:
    - Uses PyMuPDF for text with spans/size metadata (section/subsection detection)
    - Uses Camelot for tables
    - Detects images and optionally embeds them
    - Classifies signature/footer blocks

    pdf_path: path of the PDF file on disk
    embed_images: include each image as a base64 data URI ("image_b64")
    do_ocr_images: add "ocr_text" to image entries when OCR is available;
                   this works whether or not embed_images is set
    Returns {"pages": [{"page_number": int, "content": [entry, ...]}, ...]}.
    Within a page the order is text entries, then tables, then images.
    Section/subsection tracking starts fresh on every page.
    """
    result = {"pages": []}
    run_ocr = do_ocr_images and OCR_AVAILABLE

    doc = fitz.open(pdf_path)
    try:
        for page_number, page in enumerate(doc, start=1):
            blocks = page.get_text("dict").get("blocks", [])
            section_t, subsection_t = detect_headings(_collect_text_spans(blocks))

            content, current_section, current_subsection = _extract_text_entries(
                blocks, section_t, subsection_t
            )
            content.extend(
                _extract_table_entries(pdf_path, page_number, current_section, current_subsection)
            )

            # OCR needs the encoded image bytes, so request them whenever OCR
            # will run and strip them again afterwards if the caller did not
            # ask for embedding.
            images = extract_images_from_page(page, embed_images or run_ocr)
            if run_ocr:
                for img in images:
                    data_uri = img.get("image_b64")
                    if data_uri:
                        ocr_text = ocr_image_bytes(data_uri)
                        if ocr_text:
                            img["ocr_text"] = ocr_text
                    if not embed_images:
                        img.pop("image_b64", None)
            content.extend(images)

            result["pages"].append({"page_number": page_number, "content": content})
    finally:
        # Release the document even when a page fails part-way through.
        doc.close()
    return result

# ---------------- Streamlit App UI ----------------
# Page configuration is the first Streamlit call in the script.
st.set_page_config(page_title="PDF → Structured JSON (Robust)", layout="wide")
st.title("PDF Parsing and Structured JSON Extraction")

# Short feature overview shown under the title.
st.markdown(
    """
Upload a PDF and the app will:
- detect sections/subsections by font-size heuristics,
- extract paragraphs and group them,
- extract tables (Camelot),
- detect images/charts and optionally embed them (base64),
- identify signature/footer/contact blocks,
- optionally OCR text inside images (Tesseract required).
"""
)

# Upload widget plus three option checkboxes laid out in equal-width columns.
# The resulting values (uploaded_file, embed_images, do_ocr_images, pretty)
# are read by the extraction section further down.
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
col1, col2, col3 = st.columns([1, 1, 1])
with col1:
    embed_images = st.checkbox("Embed images (base64) into JSON", value=False)
with col2:
    do_ocr_images = st.checkbox("Run OCR on images (pytesseract)", value=False)
with col3:
    pretty = st.checkbox("Pretty-print JSON preview", value=True)

# Tell the user up front when OCR was requested but the optional imports failed.
if do_ocr_images and not OCR_AVAILABLE:
    st.warning("pytesseract or PIL not available in environment — OCR disabled. Install pytesseract and Tesseract engine.")

# Extraction flow: runs only when a file is uploaded and the button is pressed.
if uploaded_file is not None:
    if st.button("Extract → JSON"):
        # The extractors need a real file path, so the upload is spooled to a
        # temporary file. It is created only for the duration of one
        # extraction and removed afterwards; writing it on every script rerun
        # would leave one orphaned file per widget interaction.
        tmp_path = None
        try:
            # delete=False so the file can be closed and then reopened by path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp_path = tmp.name
                # getvalue() returns the whole upload regardless of the
                # current stream position.
                tmp.write(uploaded_file.getvalue())

            with st.spinner("Extracting..."):
                json_data = extract_pdf_content(tmp_path, embed_images=embed_images, do_ocr_images=do_ocr_images)

            st.success("Extraction complete ✅")

            # JSON preview
            if pretty:
                st.json(json_data)
            else:
                st.code(json.dumps(json_data, ensure_ascii=False))

            # Offer download of JSON
            json_bytes = json.dumps(json_data, indent=2, ensure_ascii=False).encode("utf-8")
            st.download_button("⬇️ Download JSON", data=json_bytes, file_name="extracted.json", mime="application/json")

            # If images are embedded, show thumbnails of the first six.
            if embed_images:
                st.write("Extracted Images (embedded):")
                thumbnails = [
                    content["image_b64"]
                    for p in json_data["pages"]
                    for content in p["content"]
                    if content.get("type") == "chart" and content.get("image_b64")
                ]
                for data_uri in thumbnails[:6]:
                    st.image(data_uri, width=300)

        except Exception as e:
            st.error(f"Extraction failed: {e}")
            st.exception(e)
        finally:
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Cleanup is best-effort; a leftover temp file must not
                    # mask the extraction result or error shown above.
                    pass
else:
    st.info("Upload a PDF to begin.")

# Static footer: a horizontal rule followed by environment requirements and
# caveats for the optional dependencies and the heuristics used above.
st.markdown("---")
st.markdown("**Notes / Requirements**:")
st.markdown(
    """
- **Camelot** requires Ghostscript and a compatible environment (works best with Linux).
- **pytesseract** requires the Tesseract engine installed on your system.
- Embedding images as base64 increases JSON size considerably; disable embedding if you only need metadata.
- The heuristics (font-size thresholds, regexes) are conservative — you may need to tweak thresholds for certain document families.
"""
)