Spaces:
Running
Running
import os | |
import io | |
import zipfile | |
import fitz # PyMuPDF | |
from PIL import Image | |
import pytesseract | |
import pdfplumber | |
import camelot | |
from PyPDF2 import PdfReader, PdfWriter | |
# -------------------------
# BASIC PDF TOOLS (original set)
# -------------------------
def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Split selected pages into separate single-page PDFs and return them as a ZIP.

    Args:
        pdf_path: Path of the source PDF.
        start_page: First page to export (1-indexed).
        end_page: Last page to export (1-indexed, inclusive).

    Returns:
        An in-memory ZIP, rewound to position 0, holding one ``page_<n>.pdf``
        entry per exported page. The archive is empty when
        ``end_page < start_page``.

    Raises:
        ValueError: If ``start_page`` is less than 1.
    """
    # A page number below 1 turns into a zero/negative index further down and
    # would silently wrap around to the end of the document, so reject it
    # before touching the file.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = PdfReader(pdf_path)
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zipf:
        for page_number in range(start_page, end_page + 1):
            # A fresh writer per page so every ZIP entry is a one-page PDF.
            writer = PdfWriter()
            writer.add_page(reader.pages[page_number - 1])
            pdf_bytes = io.BytesIO()
            writer.write(pdf_bytes)
            zipf.writestr(f"page_{page_number}.pdf", pdf_bytes.getvalue())
    zip_buffer.seek(0)
    return zip_buffer
def merge_pdfs(files_or_paths) -> io.BytesIO:
    """
    Concatenate several PDFs, in the order given, into one document.

    Each item may be a file-like object (anything exposing ``read``) or a
    path; paths are converted with ``str()`` before being opened.
    Returns the merged PDF as a BytesIO rewound to the start.
    """
    merged = PdfWriter()
    for source in files_or_paths:
        is_stream = hasattr(source, "read")
        document = PdfReader(source if is_stream else str(source))
        for page in document.pages:
            merged.add_page(page)

    buffer = io.BytesIO()
    merged.write(buffer)
    buffer.seek(0)
    return buffer
def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Extract a page range into a single in-memory PDF.

    Args:
        pdf_path: Path of the source PDF.
        start_page: First page to keep (1-indexed).
        end_page: Last page to keep (1-indexed, inclusive).

    Returns:
        The extracted pages as a BytesIO rewound to position 0. No pages are
        added when ``end_page < start_page``.

    Raises:
        ValueError: If ``start_page`` is less than 1.
    """
    # Page numbers below 1 would become zero/negative indexes and silently
    # pick pages from the end of the document instead of failing.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    for page_number in range(start_page, end_page + 1):
        writer.add_page(reader.pages[page_number - 1])
    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out
def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
    """Drop the first and/or last page and return the remaining pages as an in-memory PDF."""
    source = PdfReader(pdf_path)
    result = PdfWriter()
    last_index = len(source.pages) - 1

    for index in range(last_index + 1):
        drop_first = remove_first and index == 0
        drop_last = remove_last and index == last_index
        if not (drop_first or drop_last):
            result.add_page(source.pages[index])

    buffer = io.BytesIO()
    result.write(buffer)
    buffer.seek(0)
    return buffer
# -------------------------
# ADVANCED UTILITIES
# -------------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the document's selectable text (no OCR), with pages concatenated in order."""
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)
def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
    """
    Highlight every occurrence of a keyword and save the result as a new PDF.

    Matching is delegated to PyMuPDF's ``Page.search_for``.

    Args:
        pdf_path: Path of the source PDF.
        keyword: Text to search for on every page.
        output_path: Where the annotated copy is written.

    Returns:
        ``output_path``.
    """
    # The context manager closes the document even if searching or saving
    # fails, so the source file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for rect in page.search_for(keyword, quads=False):
                page.add_highlight_annot(rect)
        doc.save(output_path, garbage=4, deflate=True)
    return output_path
def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> str:
    """
    Extract the images referenced by each page into a folder.

    Files are named ``page_<p>_img_<k>.<ext>`` with 1-based page and image
    numbers; the extension is the one reported by PyMuPDF for the image.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Destination directory; created if it does not exist.

    Returns:
        ``output_folder`` (the directory path, not a list of files).
    """
    os.makedirs(output_folder, exist_ok=True)
    # Context manager so the document is closed even when an image cannot be
    # extracted or written.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each image entry is its xref
                base_image = doc.extract_image(xref)
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_num+1}_img_{img_index+1}.{base_image['ext']}",
                )
                with open(image_filename, "wb") as f:
                    f.write(base_image["image"])
    return output_folder
def extract_tables(pdf_path: str):
    """
    Extract tables, trying Camelot first and falling back to pdfplumber.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        A list of DataFrames when Camelot finds at least one table; otherwise
        the tables found by pdfplumber, each as a list of rows. The list is
        empty when neither library finds a table.
    """
    import logging

    try:
        tables = camelot.read_pdf(pdf_path, pages="all")
        if tables.n > 0:
            return [t.df for t in tables]
    except Exception:
        # Best-effort: any Camelot failure must not stop the pdfplumber
        # fallback, but record it so the cause is not lost.
        logging.getLogger(__name__).warning(
            "Camelot table extraction failed for %s; falling back to pdfplumber",
            pdf_path,
            exc_info=True,
        )

    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            results.extend(page.extract_tables() or [])
    return results
def ocr_pdf(pdf_path: str, lang: str = "eng", dpi: int = 72) -> str:
    """
    OCR every page of a PDF by rasterizing it with PyMuPDF and running Tesseract.

    Args:
        pdf_path: Path of the source PDF.
        lang: Tesseract language code(s) passed to ``image_to_string``.
        dpi: Rendering resolution. The default of 72 equals PyMuPDF's native
            scale (identity matrix); higher values give Tesseract more pixels
            to work with at the cost of time and memory.

    Returns:
        The recognized text of all pages in order, each page followed by a
        newline.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    # Loop-invariant settings, computed once.
    tesseract_config = r"--oem 3 --psm 6"
    zoom = dpi / 72  # PDF user space is 72 units per inch
    page_texts = []
    with fitz.open(pdf_path) as doc:
        matrix = fitz.Matrix(zoom, zoom)
        for page in doc:
            pix = page.get_pixmap(matrix=matrix)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                page_texts.append(
                    pytesseract.image_to_string(img, lang=lang, config=tesseract_config)
                )
    return "".join(f"{page_text}\n" for page_text in page_texts)
def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
    """
    Write a copy of the PDF whose pages follow ``new_order``.

    Args:
        pdf_path: Path of the source PDF.
        new_order: 0-indexed source page numbers in the desired output order.
            A page may be repeated or omitted.
        output_path: Where the reordered copy is written.

    Returns:
        ``output_path``.

    Raises:
        IndexError: If any entry of ``new_order`` is negative or not a valid
            page index of the source document.
    """
    order = list(new_order)
    # insert_pdf treats negative from_page/to_page as "use the default", which
    # would copy far more than one page, so reject negatives up front.
    negatives = [index for index in order if index < 0]
    if negatives:
        raise IndexError(f"page indexes must be >= 0, got {negatives[0]}")

    # Both documents are closed on exit, including on error.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        total = len(src)
        for index in order:
            # Out-of-range values would otherwise be clamped to an existing
            # page and silently produce the wrong document.
            if index >= total:
                raise IndexError(
                    f"page index {index} out of range for a {total}-page document"
                )
            dst.insert_pdf(src, from_page=index, to_page=index)
        dst.save(output_path)
    return output_path
def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
    """
    Set the rotation of selected pages and save the result as a new PDF.

    Args:
        pdf_path: Path of the source PDF.
        pages_to_rotate: 0-indexed page numbers to rotate.
        angle: Rotation in degrees; must be a multiple of 90 (e.g. 90/180/270).
        output_path: Where the rotated copy is written.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.
    """
    # PDF page rotation only supports quarter turns; fail loudly instead of
    # handing an unsupported value to the library.
    if angle % 90 != 0:
        raise ValueError(f"angle must be a multiple of 90, got {angle}")

    # Context manager so the document is closed even if a page index is bad.
    with fitz.open(pdf_path) as doc:
        for page_index in pages_to_rotate:
            doc[page_index].set_rotation(angle)
        doc.save(output_path)
    return output_path
def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
    """
    Stamp a gray text watermark on every page and save the result as a new PDF.

    The text is drawn horizontally (``rotate=0``) at 30 pt in an opaque
    mid-gray, starting a quarter of the way across the page and halfway down.

    Args:
        pdf_path: Path of the source PDF.
        watermark_text: Text to draw on each page.
        output_path: Where the watermarked copy is written.

    Returns:
        ``output_path``.
    """
    # Context manager so the document is closed even if drawing or saving fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            rect = page.rect
            page.insert_text(
                (rect.width * 0.25, rect.height * 0.5),
                watermark_text,
                fontsize=30,
                rotate=0,
                color=(0.59, 0.59, 0.59),
            )
        doc.save(output_path)
    return output_path
def extract_metadata(pdf_path: str) -> dict:
    """Open the PDF and return its metadata mapping, or an empty dict when it has none."""
    with fitz.open(pdf_path) as document:
        metadata = document.metadata
        return metadata if metadata else {}