Spaces:
Running
Running
File size: 6,228 Bytes
1e9ae83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
import os
import io
import zipfile
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import pdfplumber
import camelot
from PyPDF2 import PdfReader, PdfWriter
# -------------------------
# BASIC PDF TOOLS
# -------------------------
def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Split selected pages into separate PDFs and return a ZIP (in-memory).

    start_page/end_page are 1-indexed (inclusive). Every page in the range is
    written to its own single-page PDF named ``page_<n>.pdf`` inside the ZIP.
    If start_page is greater than end_page the ZIP is returned empty.

    Raises:
        ValueError: if start_page is smaller than 1.

    Returns:
        A BytesIO holding the ZIP archive, rewound to position 0.
    """
    # A start below 1 would turn into a zero/negative index further down
    # (reader.pages[i - 1]) and silently pick pages counted from the END of
    # the document, so reject it before touching the file.
    if start_page < 1:
        raise ValueError(
            f"start_page must be >= 1 (pages are 1-indexed), got {start_page}"
        )

    reader = PdfReader(pdf_path)
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zipf:
        for page_number in range(start_page, end_page + 1):
            writer = PdfWriter()
            # Convert the 1-indexed page number to the reader's 0-indexed list.
            writer.add_page(reader.pages[page_number - 1])
            pdf_bytes = io.BytesIO()
            writer.write(pdf_bytes)
            zipf.writestr(f"page_{page_number}.pdf", pdf_bytes.getvalue())
    zip_buffer.seek(0)
    return zip_buffer
def merge_pdfs(files_or_paths) -> io.BytesIO:
    """
    Concatenate several PDFs, in the order given, into one document.

    Each item may be a file-like object (anything with a ``read`` attribute)
    or a file path; paths are converted with ``str()`` before being opened.
    Returns the merged PDF as a BytesIO rewound to position 0.
    """
    merged = PdfWriter()
    for source in files_or_paths:
        # File-like objects are handed to PdfReader as-is; everything else is
        # treated as a path.
        if hasattr(source, "read"):
            reader = PdfReader(source)
        else:
            reader = PdfReader(str(source))
        for page in reader.pages:
            merged.add_page(page)

    buffer = io.BytesIO()
    merged.write(buffer)
    buffer.seek(0)
    return buffer
def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Extract a page range (1-indexed, inclusive) into a single PDF (in-memory).

    If start_page is greater than end_page no pages are added to the output.

    Raises:
        ValueError: if start_page is smaller than 1.

    Returns:
        A BytesIO holding the new PDF, rewound to position 0.
    """
    # A start below 1 would become a zero/negative index below
    # (reader.pages[i - 1]) and silently select pages counted from the END of
    # the document, so reject it before touching the file.
    if start_page < 1:
        raise ValueError(
            f"start_page must be >= 1 (pages are 1-indexed), got {start_page}"
        )

    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    for page_number in range(start_page, end_page + 1):
        # Convert the 1-indexed page number to the reader's 0-indexed list.
        writer.add_page(reader.pages[page_number - 1])

    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out
def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
    """
    Copy a PDF while optionally dropping its first and/or last page.

    Returns the resulting PDF as a BytesIO rewound to position 0. For a
    one-page document the single page counts as both first and last.
    """
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    last_index = len(reader.pages) - 1

    for index, page in enumerate(reader.pages):
        drop_as_first = remove_first and index == 0
        drop_as_last = remove_last and index == last_index
        if not (drop_as_first or drop_as_last):
            writer.add_page(page)

    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer
# -------------------------
# ADVANCED UTILITIES
# -------------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated selectable text of every page (PyMuPDF, no OCR)."""
    with fitz.open(pdf_path) as doc:
        # Page texts are joined back-to-back with no separator added.
        return "".join(page.get_text() for page in doc)
def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
    """
    Search for a keyword on every page and highlight each occurrence.

    Matching is delegated to PyMuPDF's ``Page.search_for`` (documented as
    case-insensitive). The annotated document is written to ``output_path``
    with garbage collection and deflate compression enabled.

    Returns:
        The path the highlighted PDF was saved to (``output_path``).
    """
    # The context manager closes the document even if searching or saving
    # fails; the previous version never closed it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # quads=False makes search_for return plain rectangles.
            for rect in page.search_for(keyword, quads=False):
                page.add_highlight_annot(rect)
        doc.save(output_path, garbage=4, deflate=True)
    return output_path
def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> str:
    """
    Extract the images embedded in a PDF into a folder.

    The folder is created if it does not exist. Files are named
    ``page_<page>_img_<n>.<ext>`` with 1-based page and image numbers, using
    the extension reported by PyMuPDF for each image.

    Returns:
        The output folder path. (The earlier annotation and docstring promised
        a list of file paths, but the function has always returned the folder;
        the runtime behavior is kept so existing callers are unaffected.)
    """
    os.makedirs(output_folder, exist_ok=True)
    # The context manager closes the document even if an extraction or a file
    # write fails; the previous version never closed it.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for image_number, image_info in enumerate(images, start=1):
                xref = image_info[0]  # first entry is the image's xref
                base_image = doc.extract_image(xref)
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_number}_img_{image_number}.{base_image['ext']}",
                )
                with open(image_filename, "wb") as f:
                    f.write(base_image["image"])
    return output_folder
def extract_tables(pdf_path: str):
    """
    Extract tables from a PDF: try Camelot first, fall back to pdfplumber.

    Returns:
        A list of DataFrames when Camelot finds at least one table; otherwise
        the list of tables produced by pdfplumber's ``page.extract_tables()``
        across all pages (possibly empty).

    Camelot errors never propagate: they are logged and the pdfplumber
    fallback is used instead.
    """
    import logging

    try:
        tables = camelot.read_pdf(pdf_path, pages="all")
        if tables.n > 0:
            return [t.df for t in tables]
    except Exception:
        # Deliberate best-effort: any Camelot failure falls through to
        # pdfplumber, but the cause is recorded instead of being discarded.
        logging.getLogger(__name__).warning(
            "Camelot table extraction failed for %s; falling back to pdfplumber",
            pdf_path,
            exc_info=True,
        )

    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a falsy result so extend() always gets a list.
            results.extend(page.extract_tables() or [])
    return results
def ocr_pdf(pdf_path: str, lang: str = "eng") -> str:
    """
    OCR every page of a PDF with Tesseract.

    Each page is rasterized by PyMuPDF with its default pixmap settings,
    passed to pytesseract as a PNG-decoded PIL image, and the recognized text
    of each page is followed by a newline in the returned string.
    """
    tesseract_config = r'--oem 3 --psm 6'
    page_texts = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            png_bytes = page.get_pixmap().tobytes("png")
            page_image = Image.open(io.BytesIO(png_bytes))
            recognized = pytesseract.image_to_string(
                page_image, lang=lang, config=tesseract_config
            )
            page_texts.append(recognized + "\n")
    return "".join(page_texts)
def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
    """
    Build a new PDF whose pages follow ``new_order`` and save it.

    ``new_order`` holds 0-indexed page positions of the source document; a
    position may be repeated or left out.

    Raises:
        IndexError: if a position is negative or past the last page.

    Returns:
        The path the reordered PDF was saved to (``output_path``).
    """
    # Both documents are closed on exit, including on error; the previous
    # version never closed either of them.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        page_count = len(src)
        for index in new_order:
            # NOTE(review): per the PyMuPDF docs, insert_pdf treats negative
            # page numbers as "use the default" and clamps numbers past the
            # end, so a bad index would silently copy the wrong page(s)
            # instead of failing. Reject it explicitly.
            if not 0 <= index < page_count:
                raise IndexError(
                    f"page index {index} out of range for a {page_count}-page document"
                )
            dst.insert_pdf(src, from_page=index, to_page=index)
        dst.save(output_path)
    return output_path
def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
    """
    Set the rotation of selected 0-indexed pages and save the result.

    Args:
        pdf_path: source PDF.
        pages_to_rotate: 0-indexed page positions to rotate.
        angle: rotation in degrees; must be a multiple of 90 (e.g. 90/180/270).
        output_path: where the rotated PDF is written.

    Raises:
        ValueError: if ``angle`` is not a multiple of 90.

    Returns:
        The path the rotated PDF was saved to (``output_path``).
    """
    # PDF page rotation only supports quarter turns; fail early with a clear
    # message rather than passing an unsupported value to the library.
    if angle % 90 != 0:
        raise ValueError(f"angle must be a multiple of 90 degrees, got {angle}")

    # The context manager closes the document even if rotating or saving
    # fails; the previous version never closed it.
    with fitz.open(pdf_path) as doc:
        for page_index in pages_to_rotate:
            doc[page_index].set_rotation(angle)
        doc.save(output_path)
    return output_path
def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
    """
    Stamp a gray text watermark on every page and save the result.

    The text is drawn horizontally (``rotate=0``) at 30 pt in gray, starting
    a quarter of the page width from the left and at half the page height.
    No opacity or diagonal transform is applied, so the text is neither
    semi-transparent nor diagonal, as the previous docstring claimed.

    Returns:
        The path the watermarked PDF was saved to (``output_path``).
    """
    # The context manager closes the document even if drawing or saving
    # fails; the previous version never closed it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            rect = page.rect
            page.insert_text(
                (rect.width * 0.25, rect.height * 0.5),
                watermark_text,
                fontsize=30,
                rotate=0,
                color=(0.59, 0.59, 0.59),
            )
        doc.save(output_path)
    return output_path
def extract_metadata(pdf_path: str) -> dict:
    """Return the document's metadata dictionary, or an empty dict if it has none."""
    with fitz.open(pdf_path) as doc:
        metadata = doc.metadata
    if metadata:
        return metadata
    return {}
|