lithish2602's picture
Upload 2 files
350f8c3 verified
Raw
History Blame Contribute Delete
13.1 kB
# app.py
import streamlit as st
import fitz # PyMuPDF
import pdfplumber
import camelot
import json
import tempfile
import os
import re
import base64
from io import BytesIO
from statistics import mean, pstdev
# Optional OCR
try:
import pytesseract
from PIL import Image
OCR_AVAILABLE = True
except Exception:
OCR_AVAILABLE = False
# Patterns used by classify_footer_and_signature() to recognise contact-style
# ("footer") text blocks.

# Email-like token: local part, "@", domain, and a TLD of at least two letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone-like number: optional country code (1-3 digits, optional "+"), then
# digit groups of 2-4, 3-4 and 3-4 digits with optional separators between.
# NOTE(review): separators are optional, so any run of 8+ digits also matches.
PHONE_RE = re.compile(r"(\+?\d{1,3})?[\s\-.(]*(\d{2,4})[\s\-.)]*(\d{3,4})[\s\-]*(\d{3,4})")
# URL-like token: "http://", "https://" or "www." followed by non-whitespace.
URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
# The standalone word "CIN" (any case) plus the remainder of that line.
CIN_RE = re.compile(r"\bCIN\b.*", flags=re.IGNORECASE)
def image_bytes_to_base64(img_bytes, mime="image/png"):
    """Wrap raw image bytes in a base64 ``data:`` URI.

    Args:
        img_bytes: The binary image content to encode.
        mime: MIME type placed in the URI header (defaults to PNG).

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    """
    payload = base64.b64encode(img_bytes).decode("utf-8")
    return "data:{};base64,{}".format(mime, payload)
def detect_headings(spans):
    """Derive font-size cut-offs for section and sub-section headings.

    Args:
        spans: Iterable of ``(text, size, flags, font)`` tuples; only the
            ``size`` element is used, and non-positive sizes are ignored.

    Returns:
        ``(section_threshold, subsection_threshold)``. With no usable sizes
        the fixed fallback ``(16, 12)`` is returned. Otherwise the section
        threshold is the mean size plus one population standard deviation
        (never below 14) and the sub-section threshold is the mean size
        (never below 11).
    """
    positive_sizes = []
    for _text, size, _flags, _font in spans:
        if size > 0:
            positive_sizes.append(size)

    if len(positive_sizes) == 0:
        return (16, 12)

    average = mean(positive_sizes)
    # pstdev needs at least two samples to be meaningful here; a single
    # sample contributes no spread.
    spread = 0 if len(positive_sizes) <= 1 else pstdev(positive_sizes)

    section_cutoff = max(14, average + spread)
    subsection_cutoff = max(11, average)
    return (section_cutoff, subsection_cutoff)
# Letter closings that mark a signature block (compared case-insensitively).
SIGNATURE_CLOSINGS = ("yours sincerely", "yours faithfully")
# A line that *starts* with "For <Capitalised name>", e.g. "For ACME Ltd".
# Matching the bare substring "for " anywhere would flag nearly every English
# paragraph as a signature, so the match is anchored to the start of a line
# and requires a capitalised word to follow.
SIGNATURE_FOR_RE = re.compile(r"^\s*(?:For|FOR)\s+[A-Z]", flags=re.MULTILINE)
# Job titles that typically appear beneath a signature (matched on lowercase text).
SIGNATURE_ROLE_RE = re.compile(r"\b(dean|director|manager|ceo|coo)\b")


def classify_footer_and_signature(lines):
    """Classify a block of text lines as signature, footer, or paragraph.

    Args:
        lines: List of strings making up one text block.

    Returns:
        ``(type, combined_text)`` where ``type`` is one of ``"signature"``,
        ``"footer"`` or ``"paragraph"`` and ``combined_text`` is the lines
        joined with newlines and stripped.

    Signature clues are checked first: a letter closing, a line opening with
    "For <Name>", or a job-title word. Otherwise a block containing an email
    address, URL, phone-like number or "CIN" marker is treated as a footer.
    """
    combined = "\n".join(lines).strip()
    lowered = combined.lower()

    looks_like_signature = (
        any(closing in lowered for closing in SIGNATURE_CLOSINGS)
        or SIGNATURE_FOR_RE.search(combined) is not None
        or SIGNATURE_ROLE_RE.search(lowered) is not None
    )
    if looks_like_signature:
        return "signature", combined

    contact_patterns = (EMAIL_RE, URL_RE, PHONE_RE, CIN_RE)
    if any(pattern.search(combined) for pattern in contact_patterns):
        return "footer", combined

    return "paragraph", combined
def extract_images_from_page(page, embed_images):
    """Build one descriptor per image referenced by a PyMuPDF page.

    Args:
        page: A PyMuPDF page object.
        embed_images: When truthy, each descriptor also carries the image as
            a PNG ``data:`` URI under ``"image_b64"``.

    Returns:
        List of dicts ``{"type": "chart", "description": ...}`` with an
        optional ``"image_b64"`` key. Images that fail to convert are
        reported on stdout and left out of the list.
    """
    collected = []
    for position, image_info in enumerate(page.get_images(full=True), start=1):
        xref = image_info[0]
        try:
            pixmap = fitz.Pixmap(page.parent, xref)
            # Four or more colour channels (e.g. CMYK) cannot be written as
            # PNG directly, so convert to RGB first.
            colour_channels = pixmap.n - pixmap.alpha
            if colour_channels >= 4:
                pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
            png_data = pixmap.tobytes("png")

            entry = {
                "type": "chart",
                "description": f"Image {position} on page {page.number + 1}",
            }
            if embed_images:
                entry["image_b64"] = image_bytes_to_base64(png_data, mime="image/png")
            collected.append(entry)

            # Drop the reference so the pixmap memory can be reclaimed.
            pixmap = None
        except Exception as e:
            print(f"⚠️ Could not extract image {position} on page {page.number+1}: {e}")
    return collected
def ocr_image_bytes(img_b64):
    """Run OCR on a base64-encoded image and return the recognised text.

    Args:
        img_b64: Either a ``data:<mime>;base64,<payload>`` URI or a bare
            base64 payload.

    Returns:
        The stripped OCR text, or ``None`` when OCR support is unavailable,
        the input is empty, the image cannot be decoded, the OCR engine
        fails, or no text was recognised.
    """
    if not OCR_AVAILABLE or not img_b64:
        return None

    # Accept both a full data URI and a bare payload: with no comma present
    # the whole string is treated as the base64 data.
    _header, separator, payload = img_b64.partition(",")
    if not separator:
        payload = img_b64

    try:
        raw = base64.b64decode(payload)
        with Image.open(BytesIO(raw)) as opened:
            text = pytesseract.image_to_string(opened.convert("RGB"))
    except (OSError, ValueError, RuntimeError) as e:
        # OCR is best-effort: a missing Tesseract binary, an engine error or
        # an undecodable image must not abort the surrounding extraction.
        print(f"⚠️ OCR failed: {e}")
        return None

    return text.strip() or None
def _collect_page_spans(blocks):
    """Return ``(text, size, flags, font)`` for every non-empty span in ``blocks``."""
    spans = []
    for block in blocks:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "").strip()
                if text:
                    spans.append(
                        (text, span.get("size", 0), span.get("flags", 0), span.get("font", ""))
                    )
    return spans


def _text_block_entry(lines, section, sub_section):
    """Classify accumulated body lines and wrap them as a content entry."""
    block_type, combined = classify_footer_and_signature(lines)
    return {
        "type": block_type,
        "section": section,
        "sub_section": sub_section,
        "text": combined,
    }


def extract_pdf_content(pdf_path, embed_images=False, do_ocr_images=False):
    """Extract structured content from a PDF, page by page.

    Pipeline per page:
      - PyMuPDF text spans drive font-size based section/sub-section detection.
      - Remaining lines are grouped per block and classified as paragraph,
        signature or footer.
      - Camelot is asked for tables on the page (best-effort).
      - Embedded images are listed, optionally embedded as base64 and
        optionally OCR'd.

    Args:
        pdf_path: Path of the PDF file to read.
        embed_images: Include each image as a base64 data URI in the output.
        do_ocr_images: Run OCR on each image (only when OCR support is
            available) and store the text under ``"ocr_text"``.

    Returns:
        ``{"pages": [{"page_number": int, "content": [entry, ...]}, ...]}``.
    """
    result = {"pages": []}
    run_ocr = bool(do_ocr_images and OCR_AVAILABLE)

    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            content = []
            blocks = page.get_text("dict").get("blocks", [])
            section_t, subsection_t = detect_headings(_collect_page_spans(blocks))

            # NOTE(review): heading context is tracked per page, so text on a
            # new page has no section until that page shows a heading.
            current_section = None
            current_subsection = None

            for block in blocks:
                pending_lines = []
                for line in block.get("lines", []):
                    # Keep only spans that carry text; empty spans would
                    # otherwise introduce doubled spaces in the joined line.
                    styled = []
                    for span in line.get("spans", []):
                        span_text = span.get("text", "").strip()
                        if span_text:
                            styled.append((span_text, span.get("size", 0)))
                    if not styled:
                        continue

                    max_size = max(size for _, size in styled)
                    text_line = " ".join(text for text, _ in styled)
                    word_count = len(text_line.split())

                    if max_size >= section_t and (text_line.isupper() or word_count <= 6):
                        heading_kind = "section"
                    elif max_size >= subsection_t and word_count <= 8:
                        heading_kind = "sub_section"
                    else:
                        pending_lines.append(text_line)
                        continue

                    # Flush body text that precedes the heading so it keeps
                    # its document order and its original section context.
                    if pending_lines:
                        content.append(
                            _text_block_entry(pending_lines, current_section, current_subsection)
                        )
                        pending_lines = []

                    if heading_kind == "section":
                        current_section = text_line
                        current_subsection = None
                    else:
                        current_subsection = text_line
                    content.append({
                        "type": heading_kind,
                        "section": current_section,
                        "sub_section": current_subsection,
                        "text": None,
                    })

                if pending_lines:
                    content.append(
                        _text_block_entry(pending_lines, current_section, current_subsection)
                    )

            # --- Camelot tables for this page (best-effort) ---
            try:
                tables = camelot.read_pdf(pdf_path, pages=str(page_number))
                for idx, table in enumerate(tables, start=1):
                    content.append({
                        "type": "table",
                        "section": current_section,
                        "sub_section": current_subsection,
                        "description": f"Table {idx} on page {page_number}",
                        "table_data": table.df.values.tolist(),
                    })
            except Exception as e:
                # Camelot may fail on unsupported pages; report it and keep
                # going so the rest of the page is still extracted.
                print(f"⚠️ Table extraction failed on page {page_number}: {e}")

            # --- Images / charts ---
            # OCR needs the image bytes, so request them whenever OCR will
            # run, even if the caller does not want them embedded.
            images = extract_images_from_page(page, embed_images or run_ocr)
            for img in images:
                if run_ocr and "image_b64" in img:
                    ocr_text = ocr_image_bytes(img["image_b64"])
                    if ocr_text:
                        img["ocr_text"] = ocr_text
                if not embed_images:
                    # Bytes were fetched only for OCR; keep the output lean.
                    img.pop("image_b64", None)
                content.append(img)

            result["pages"].append({"page_number": page_number, "content": content})

    return result
# ---------------- Streamlit App UI ----------------
# Streamlit re-executes this script from the top on every widget interaction,
# so nothing here may leave files behind as a side effect of a plain rerun.
st.set_page_config(page_title="PDF → Structured JSON (Robust)", layout="wide")
st.title("PDF Parsing and Structured JSON Extraction")
st.markdown(
    """
Upload a PDF and the app will:
- detect sections/subsections by font-size heuristics,
- extract paragraphs and group them,
- extract tables (Camelot),
- detect images/charts and optionally embed them (base64),
- identify signature/footer/contact blocks,
- optionally OCR text inside images (Tesseract required).
"""
)

uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

col1, col2, col3 = st.columns([1, 1, 1])
with col1:
    embed_images = st.checkbox("Embed images (base64) into JSON", value=False)
with col2:
    do_ocr_images = st.checkbox("Run OCR on images (pytesseract)", value=False)
with col3:
    pretty = st.checkbox("Pretty-print JSON preview", value=True)

if do_ocr_images and not OCR_AVAILABLE:
    st.warning("pytesseract or PIL not available in environment — OCR disabled. Install pytesseract and Tesseract engine.")

if uploaded_file is not None:
    if st.button("Extract → JSON"):
        tmp_path = None
        try:
            # Write the upload to disk only when extraction is requested, and
            # use getvalue() so the result does not depend on how far the
            # upload stream has already been read.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(uploaded_file.getvalue())
                tmp_path = tmp.name

            with st.spinner("Extracting..."):
                json_data = extract_pdf_content(
                    tmp_path, embed_images=embed_images, do_ocr_images=do_ocr_images
                )
            st.success("Extraction complete ✅")

            # JSON preview
            if pretty:
                st.json(json_data)
            else:
                st.code(json.dumps(json_data, ensure_ascii=False))

            # Offer download of JSON
            json_bytes = json.dumps(json_data, indent=2, ensure_ascii=False).encode("utf-8")
            st.download_button("⬇️ Download JSON", data=json_bytes, file_name="extracted.json", mime="application/json")

            # If images are embedded, preview at most the first six.
            if embed_images:
                st.write("Extracted Images (embedded):")
                previews = []
                for p in json_data["pages"]:
                    for content in p["content"]:
                        if content.get("type") == "chart" and content.get("image_b64"):
                            previews.append(content["image_b64"])
                for image_uri in previews[:6]:
                    st.image(image_uri, width=300)
        except Exception as e:
            st.error(f"Extraction failed: {e}")
            st.exception(e)
        finally:
            # Always remove the temporary copy; failing to delete it must not
            # mask the extraction result or error shown above.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
else:
    st.info("Upload a PDF to begin.")

st.markdown("---")
st.markdown("**Notes / Requirements**:")
st.markdown(
    """
- **Camelot** requires Ghostscript and a compatible environment (works best with Linux).
- **pytesseract** requires the Tesseract engine installed on your system.
- Embedding images as base64 increases JSON size considerably; disable embedding if you only need metadata.
- The heuristics (font-size thresholds, regexes) are conservative — you may need to tweak thresholds for certain document families.
"""
)