File size: 6,228 Bytes
1e9ae83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
import io
import zipfile
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import pdfplumber
import camelot
from PyPDF2 import PdfReader, PdfWriter


# -------------------------
# BASIC PDF TOOLS (original utilities)
# -------------------------

def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Split selected pages into separate PDFs and return a ZIP (in-memory).

    Args:
        pdf_path: Path of the source PDF.
        start_page: First page to export, 1-indexed.
        end_page: Last page to export, 1-indexed (inclusive).

    Returns:
        A BytesIO rewound to offset 0 holding a ZIP archive with one
        single-page PDF per selected page, named ``page_<n>.pdf``. The
        archive is empty when ``end_page < start_page``.

    Raises:
        ValueError: If ``start_page`` is less than 1.

    Page numbers above the document's page count are not validated here;
    they are left to the PyPDF2 page lookup.
    """
    # Without this guard a start page of 0 or below turns into a negative
    # index (page - 1) and would silently pick pages counted from the end.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = PdfReader(pdf_path)
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zipf:
        for page_number in range(start_page, end_page + 1):
            # A fresh writer per page so each archive member is a
            # standalone one-page PDF.
            writer = PdfWriter()
            writer.add_page(reader.pages[page_number - 1])
            page_buffer = io.BytesIO()
            writer.write(page_buffer)
            zipf.writestr(f"page_{page_number}.pdf", page_buffer.getvalue())
    zip_buffer.seek(0)
    return zip_buffer


def merge_pdfs(files_or_paths) -> io.BytesIO:
    """
    Concatenate several PDFs, in the given order, into one in-memory PDF.

    Args:
        files_or_paths: Iterable of sources. An item with a ``read``
            attribute is handed to PdfReader as-is; anything else is
            converted with ``str()`` and treated as a path.

    Returns:
        A BytesIO rewound to offset 0 containing the merged PDF.
    """
    combined = PdfWriter()
    for source in files_or_paths:
        # File-like objects go through untouched; paths are normalised to str.
        if not hasattr(source, "read"):
            source = str(source)
        for page in PdfReader(source).pages:
            combined.add_page(page)

    merged_buffer = io.BytesIO()
    combined.write(merged_buffer)
    merged_buffer.seek(0)
    return merged_buffer


def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Extract a page range (1-indexed, inclusive) into a single PDF (in-memory).

    Args:
        pdf_path: Path of the source PDF.
        start_page: First page to keep, 1-indexed.
        end_page: Last page to keep, 1-indexed (inclusive).

    Returns:
        A BytesIO rewound to offset 0 containing a PDF with the selected
        pages in their original order. No pages are added when
        ``end_page < start_page``.

    Raises:
        ValueError: If ``start_page`` is less than 1.

    Page numbers above the document's page count are not validated here;
    they are left to the PyPDF2 page lookup.
    """
    # Without this guard a start page of 0 or below turns into a negative
    # index (page - 1) and would silently pick pages counted from the end.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    for page_number in range(start_page, end_page + 1):
        writer.add_page(reader.pages[page_number - 1])
    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out


def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
    """
    Copy a PDF while optionally dropping its first and/or last page.

    Args:
        pdf_path: Path of the source PDF.
        remove_first: When truthy, the page at index 0 is left out.
        remove_last: When truthy, the page at the final index is left out.

    Returns:
        A BytesIO rewound to offset 0 containing the remaining pages. For a
        one-page document the first and last page are the same page, so
        either flag removes it.
    """
    source = PdfReader(pdf_path)
    trimmed = PdfWriter()
    page_count = len(source.pages)
    last_index = page_count - 1

    for index in range(page_count):
        drop_as_first = remove_first and index == 0
        drop_as_last = remove_last and index == last_index
        if not (drop_as_first or drop_as_last):
            trimmed.add_page(source.pages[index])

    buffer = io.BytesIO()
    trimmed.write(buffer)
    buffer.seek(0)
    return buffer


# -------------------------
# ADVANCED UTILITIES
# -------------------------

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Return the text layer of every page, concatenated in page order.

    Uses PyMuPDF's ``Page.get_text()``; no OCR is performed, so pages that
    carry no text layer contribute whatever ``get_text()`` returns for them.
    """
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)


def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
    """
    Highlight every occurrence of a keyword and save the result as a new PDF.

    Matching is delegated to PyMuPDF's ``Page.search_for`` (documented as
    case-insensitive); each hit rectangle gets a highlight annotation.

    Args:
        pdf_path: Path of the source PDF.
        keyword: Text to search for on each page.
        output_path: Where the annotated copy is written.

    Returns:
        ``output_path``, unchanged.
    """
    # The context manager closes the document even if searching or saving
    # raises; the previous version never released the file handle.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for rect in page.search_for(keyword, quads=False):
                page.add_highlight_annot(rect)
        doc.save(output_path, garbage=4, deflate=True)
    return output_path


def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> list[str]:
    """
    Extract embedded images to a folder; returns list of saved image paths.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the image files; created if
            it does not exist.

    Returns:
        Paths of the files written, in page order then image order. Every
        path lies inside ``output_folder`` and is named
        ``page_<page>_img_<n>.<ext>`` (both counters 1-based). The list is
        empty when no page references an image.
    """
    os.makedirs(output_folder, exist_ok=True)
    saved_paths: list[str] = []

    # The context manager closes the document once extraction is done; the
    # previous version left it open.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]  # first tuple entry is the image's xref
                base_image = doc.extract_image(xref)
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_num}_img_{img_index}.{base_image['ext']}"
                )
                with open(image_filename, "wb") as f:
                    f.write(base_image["image"])
                saved_paths.append(image_filename)

    # Return the written paths, as the signature and docstring promise,
    # rather than the folder name.
    return saved_paths


def extract_tables(pdf_path: str):
    """
    Extract tables from a PDF, preferring Camelot over pdfplumber.

    Camelot is run on all pages first. If it finds at least one table, the
    result is a list of its DataFrames. If it finds none, or raises any
    exception, pdfplumber is used instead and the result is a flat list of
    its tables (each a list of rows), gathered page by page.
    """
    try:
        camelot_tables = camelot.read_pdf(pdf_path, pages="all")
        if camelot_tables.n > 0:
            return [table.df for table in camelot_tables]
    except Exception:
        # Best effort: any Camelot failure falls through to pdfplumber.
        pass

    with pdfplumber.open(pdf_path) as pdf:
        return [
            table
            for page in pdf.pages
            for table in (page.extract_tables() or [])
        ]


def ocr_pdf(pdf_path: str, lang: str = "eng") -> str:
    """
    Run Tesseract OCR over every page of a PDF and return the combined text.

    Each page is rasterized with PyMuPDF's default ``get_pixmap()`` settings,
    encoded as PNG, loaded through PIL and passed to pytesseract.

    Args:
        pdf_path: Path of the source PDF.
        lang: Language code forwarded to Tesseract.

    Returns:
        The recognized text of all pages in order, each page's text followed
        by a newline.
    """
    # Same Tesseract options for every page, so build the string once.
    tesseract_options = r'--oem 3 --psm 6'
    page_texts = []
    with fitz.open(pdf_path) as pdf_doc:
        for pdf_page in pdf_doc:
            png_data = pdf_page.get_pixmap().tobytes("png")
            page_image = Image.open(io.BytesIO(png_data))
            recognized = pytesseract.image_to_string(
                page_image, lang=lang, config=tesseract_options
            )
            page_texts.append(recognized + "\n")
    return "".join(page_texts)


def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
    """
    Reorder pages by 0-indexed positions. Saves to output_path.

    Args:
        pdf_path: Path of the source PDF.
        new_order: 0-indexed source page numbers in the desired output
            order. Pages may be repeated or omitted.
        output_path: Where the reordered PDF is written.

    Returns:
        ``output_path``, unchanged.

    Raises:
        IndexError: If an entry of ``new_order`` is not a valid page index
            of the source document.
    """
    # Both documents are closed on exit, including on error; the previous
    # version leaked the source and destination handles.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        page_total = len(src)
        for page_index in new_order:
            # insert_pdf uses -1 as its "default" marker for from_page and
            # to_page, so a negative index would not copy a single page.
            # Reject anything outside the document instead.
            if not 0 <= page_index < page_total:
                raise IndexError(
                    f"page index {page_index} out of range for a document "
                    f"with {page_total} pages"
                )
            dst.insert_pdf(src, from_page=page_index, to_page=page_index)
        dst.save(output_path)
    return output_path


def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
    """
    Rotate selected 0-indexed pages by angle (e.g., 90/180/270).

    The angle is applied with PyMuPDF's ``Page.set_rotation``.
    NOTE(review): that call appears to set the page's rotation value rather
    than add to an existing rotation; confirm against the PyMuPDF docs if
    the inputs may already be rotated.

    Args:
        pdf_path: Path of the source PDF.
        pages_to_rotate: 0-indexed page numbers to rotate.
        angle: Rotation passed to ``set_rotation``.
        output_path: Where the rotated PDF is written.

    Returns:
        ``output_path``, unchanged.
    """
    # The context manager closes the document even if a page lookup or the
    # save raises; the previous version never released the file handle.
    with fitz.open(pdf_path) as doc:
        for page_index in pages_to_rotate:
            doc[page_index].set_rotation(angle)
        doc.save(output_path)
    return output_path


def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
    """
    Stamp a gray text watermark on every page and save the result.

    The text is inserted unrotated (``rotate=0``) at 30 pt in gray, starting
    at 25% of the page width and 50% of the page height. No opacity is
    specified, so this is not the semi-transparent diagonal watermark the
    earlier docstring described.

    Args:
        pdf_path: Path of the source PDF.
        watermark_text: Text to draw on each page.
        output_path: Where the watermarked PDF is written.

    Returns:
        ``output_path``, unchanged.
    """
    # The context manager closes the document even if inserting or saving
    # raises; the previous version never released the file handle.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            rect = page.rect
            page.insert_text(
                (rect.width * 0.25, rect.height * 0.5),
                watermark_text,
                fontsize=30,
                rotate=0,
                color=(0.59, 0.59, 0.59)
            )
        doc.save(output_path)
    return output_path


def extract_metadata(pdf_path: str) -> dict:
    """
    Read the document-level metadata of a PDF.

    Returns:
        The value of PyMuPDF's ``Document.metadata``, or an empty dict when
        that value is falsy.
    """
    with fitz.open(pdf_path) as document:
        metadata = document.metadata
    if metadata:
        return metadata
    return {}