pdf-toolbox / utils /pdf_processing.py
codemaker2015's picture
first commit
1e9ae83
import os
import io
import zipfile
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import pdfplumber
import camelot
from PyPDF2 import PdfReader, PdfWriter
# -------------------------
# BASIC PDF TOOLS (your originals)
# -------------------------
def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
"""
Split selected pages into separate PDFs and return a ZIP (in-memory).
start_page/end_page are 1-indexed (inclusive).
"""
reader = PdfReader(pdf_path)
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w") as zipf:
for i in range(start_page, end_page + 1):
writer = PdfWriter()
writer.add_page(reader.pages[i - 1])
pdf_bytes = io.BytesIO()
writer.write(pdf_bytes)
pdf_bytes.seek(0)
zipf.writestr(f"page_{i}.pdf", pdf_bytes.read())
zip_buffer.seek(0)
return zip_buffer
def merge_pdfs(files_or_paths) -> io.BytesIO:
"""
Merge multiple PDFs. Accepts a list of file-like objects or file paths.
Returns merged PDF as BytesIO.
"""
writer = PdfWriter()
for f in files_or_paths:
reader = PdfReader(f) if hasattr(f, "read") else PdfReader(str(f))
for page in reader.pages:
writer.add_page(page)
out = io.BytesIO()
writer.write(out)
out.seek(0)
return out
def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
"""Extract a page range (1-indexed, inclusive) into a single PDF (in-memory)."""
reader = PdfReader(pdf_path)
writer = PdfWriter()
for i in range(start_page, end_page + 1):
writer.add_page(reader.pages[i - 1])
out = io.BytesIO()
writer.write(out)
out.seek(0)
return out
def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
"""Remove first and/or last page and return modified PDF (in-memory)."""
reader = PdfReader(pdf_path)
writer = PdfWriter()
total = len(reader.pages)
for i in range(total):
if (remove_first and i == 0) or (remove_last and i == total - 1):
continue
writer.add_page(reader.pages[i])
out = io.BytesIO()
writer.write(out)
out.seek(0)
return out
# -------------------------
# ADVANCED UTILITIES
# -------------------------
def extract_text_from_pdf(pdf_path: str) -> str:
"""Extract selectable text (not OCR) via PyMuPDF."""
text = ""
with fitz.open(pdf_path) as doc:
for page in doc:
text += page.get_text()
return text
def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
"""Search keyword and highlight occurrences in the PDF (case-insensitive)."""
doc = fitz.open(pdf_path)
for page in doc:
matches = page.search_for(keyword, quads=False)
for rect in matches:
page.add_highlight_annot(rect)
doc.save(output_path, garbage=4, deflate=True)
return output_path
def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> list[str]:
"""Extract embedded images to a folder; returns list of saved image paths."""
os.makedirs(output_folder, exist_ok=True)
doc = fitz.open(pdf_path)
for page_num in range(len(doc)):
page = doc[page_num]
images = page.get_images(full=True)
for img_index, img in enumerate(images):
xref = img[0]
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
image_filename = os.path.join(
output_folder,
f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
)
with open(image_filename, "wb") as f:
f.write(image_bytes)
return output_folder
def extract_tables(pdf_path: str):
"""
Try Camelot first; fall back to pdfplumber.
Returns list of DataFrames (Camelot) or list-of-rows tables (pdfplumber).
"""
try:
tables = camelot.read_pdf(pdf_path, pages="all")
if tables.n > 0:
return [t.df for t in tables]
except Exception:
pass
results = []
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
page_tables = page.extract_tables()
results.extend(page_tables or [])
return results
def ocr_pdf(pdf_path: str, lang: str = "eng") -> str:
"""OCR image-only pages via Tesseract and PyMuPDF rasterization."""
text = ""
with fitz.open(pdf_path) as doc:
for page in doc:
pix = page.get_pixmap()
img = Image.open(io.BytesIO(pix.tobytes("png")))
custom_config = r'--oem 3 --psm 6'
text += pytesseract.image_to_string(img, lang=lang, config=custom_config) + "\n"
return text
def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
"""Reorder pages by 0-indexed positions. Saves to output_path."""
src = fitz.open(pdf_path)
dst = fitz.open()
for i in new_order:
dst.insert_pdf(src, from_page=i, to_page=i)
dst.save(output_path)
return output_path
def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
"""Rotate selected 0-indexed pages by angle (e.g., 90/180/270)."""
doc = fitz.open(pdf_path)
for p in pages_to_rotate:
doc[p].set_rotation(angle)
doc.save(output_path)
return output_path
def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
"""Add semi-transparent diagonal text watermark to all pages."""
doc = fitz.open(pdf_path)
for page in doc:
rect = page.rect
page.insert_text(
(rect.width * 0.25, rect.height * 0.5),
watermark_text,
fontsize=30,
rotate=0,
color=(0.59, 0.59, 0.59)
)
doc.save(output_path)
return output_path
def extract_metadata(pdf_path: str) -> dict:
"""Return PDF metadata dictionary."""
with fitz.open(pdf_path) as doc:
return doc.metadata or {}