codemaker2015 committed
Commit 1e9ae83 · 1 Parent(s): bbddb9f

first commit

Files changed (8)
  1. .python-version +1 -0
  2. Dockerfile +22 -0
  3. README.md +14 -7
  4. main.py +255 -0
  5. requirements.txt +23 -0
  6. utils/pdf_analysis.py +54 -0
  7. utils/pdf_export.py +28 -0
  8. utils/pdf_processing.py +195 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM python:3.10-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     tesseract-ocr \
+     poppler-utils \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set work directory
+ WORKDIR /app
+
+ # Copy requirements
+ COPY requirements.txt .
+
+ # Install python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy app code
+ COPY . .
+
+ # Run Streamlit app (main.py is the entry point declared in README.md)
+ CMD ["streamlit", "run", "main.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
README.md CHANGED
@@ -1,11 +1,18 @@
  ---
- title: Pdf Toolbox
- emoji: 📉
- colorFrom: green
- colorTo: yellow
- sdk: docker
+ title: PDF AI Toolkit (Streamlit + FAISS RAG)
+ emoji: 🧰
+ colorFrom: blue
+ colorTo: indigo
+ sdk: streamlit
+ app_file: main.py
+ python_version: "3.10"
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PDF AI Toolkit
+
+ A Streamlit app for PDF processing (split/merge/extract/rotate/watermark/metadata), OCR (Tesseract), table & image extraction, and analysis (FAISS RAG + Together.ai Llama).
+ See the sidebar to pick **PDF Processing**, **Advanced Processing**, **Analysis**, or **Export** tools.
+
+ ## Secrets
+ Add `TOGETHER_API_KEY` in your Space → **Settings → Secrets**.
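Outside a Space, the same key can be supplied through the shell environment or a local `.env` file; a minimal sketch of how `utils/pdf_analysis.py` resolves it at runtime (the error message below is illustrative):

```python
# Sketch: how the Together.ai key is picked up (mirrors utils/pdf_analysis.py).
import os
from dotenv import load_dotenv  # python-dotenv

load_dotenv()  # reads a local .env file, e.g. a line: TOGETHER_API_KEY=<your key>
api_key = os.getenv("TOGETHER_API_KEY")
if not api_key:
    raise RuntimeError("TOGETHER_API_KEY is not set; add it as a Space secret or export it locally")
```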
main.py ADDED
@@ -0,0 +1,255 @@
+ import os
+ import tempfile
+ import zipfile
+ import io
+ import streamlit as st
+ from utils.pdf_processing import (
+     split_pdf_pages, merge_pdfs, extract_page_range, remove_first_last_pages,
+     extract_text_from_pdf, keyword_highlight_pdf, extract_images, extract_tables,
+     ocr_pdf, reorder_pages, rotate_pages, add_watermark, extract_metadata
+ )
+ from utils.pdf_analysis import rag_qa, summarize_text
+ from utils.pdf_export import export_to_word, export_to_text, export_text_to_markdown
+
+
+ st.set_page_config(page_title="PDF Toolkit", layout="wide")
+ st.title("🛠 PDF Toolkit")
+
+ with st.sidebar:
+     st.header("Tools")
+     # Define tool categories
+     tool_categories = {
+         "PDF Processing": [
+             "Split PDF Pages",
+             "Merge PDFs",
+             "Extract Page Range",
+             "Remove First/Last Pages",
+         ],
+         "Advanced Processing": [
+             "Keyword Search & Highlight",
+             "Extract Images",
+             "Extract Tables",
+             "OCR Scanned PDF",
+             "Reorder Pages",
+             "Rotate Pages",
+             "Add Watermark",
+             "Extract Metadata",
+         ],
+         "Analysis": [
+             "Summarize PDF",
+             "Ask Questions on PDF (RAG)",
+         ],
+         "Export": [
+             "Export to Word (.docx)",
+             "Export to Text (.txt)",
+             "Export to Markdown (.md)",
+         ],
+     }
+
+     # Step 1: User selects category
+     selected_category = st.selectbox(
+         "Choose a Category",
+         list(tool_categories.keys())
+     )
+
+     # Step 2: Show tools under that category
+     tool = st.selectbox(
+         "Choose a Tool",
+         tool_categories[selected_category]
+     )
+     # st.caption("Note: For OCR, ensure Tesseract is installed on system path.")
+
+
+ # ------------- Helpers for downloads -------------
+ def download_bytes(label: str, data: bytes, file_name: str, mime: str):
+     st.download_button(label, data, file_name=file_name, mime=mime)
+
+
+ def zip_folder_to_bytes(folder_path: str) -> bytes:
+     mem = io.BytesIO()
+     with zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED) as zf:
+         for root, _, files in os.walk(folder_path):
+             for f in files:
+                 full = os.path.join(root, f)
+                 arc = os.path.relpath(full, start=folder_path)
+                 zf.write(full, arcname=arc)
+     mem.seek(0)
+     return mem.read()
+
+
+ OUTPUT_DIR = "outputs"
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ def out_file(name: str) -> str:
+     return os.path.join(OUTPUT_DIR, name)
+
+ # ------------- UI Logic -------------
+
+ # Most tools need a PDF file, except "Merge PDFs"
+ if tool == "Merge PDFs":
+     uploaded_files = st.file_uploader("Upload PDFs to merge", type=["pdf"], accept_multiple_files=True)
+     if uploaded_files:
+         if st.button("Merge"):
+             merged = merge_pdfs(uploaded_files)  # returns BytesIO
+             download_bytes("📥 Download Merged PDF", merged.getvalue(), "merged.pdf", "application/pdf")
+ else:
+     uploaded = st.file_uploader("Upload PDF", type=["pdf"])
+     if uploaded:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+             tmp.write(uploaded.read())
+             pdf_path = tmp.name
+
+         if tool == "Split PDF Pages":
+             # Read total pages for UI
+             from PyPDF2 import PdfReader as _Reader
+             total_pages = len(_Reader(pdf_path).pages)
+             option = st.radio("Select Option", ["All Pages", "Page Range", "Single Page"], horizontal=True)
+
+             if option == "All Pages":
+                 start_page, end_page = 1, total_pages
+             elif option == "Page Range":
+                 start_page = st.number_input("Start Page", 1, total_pages, 1)
+                 end_page = st.number_input("End Page", start_page, total_pages, total_pages)
+             else:
+                 start_page = st.number_input("Page Number", 1, total_pages, 1)
+                 end_page = start_page
+
+             if st.button("Split & Download ZIP"):
+                 zip_bytesio = split_pdf_pages(pdf_path, int(start_page), int(end_page))
+                 download_bytes("📥 Download ZIP", zip_bytesio.getvalue(), "split_pages.zip", "application/zip")
+
+         elif tool == "Extract Page Range":
+             from PyPDF2 import PdfReader as _Reader
+             total_pages = len(_Reader(pdf_path).pages)
+             start_page = st.number_input("Start Page", 1, total_pages, 1)
+             end_page = st.number_input("End Page", start_page, total_pages, total_pages)
+             if st.button("Extract Range"):
+                 out = extract_page_range(pdf_path, int(start_page), int(end_page))
+                 download_bytes("📥 Download Extracted PDF", out.getvalue(), "extracted_range.pdf", "application/pdf")
+
+         elif tool == "Remove First/Last Pages":
+             remove_first = st.checkbox("Remove First Page", value=True)
+             remove_last = st.checkbox("Remove Last Page", value=False)
+             if st.button("Remove & Download"):
+                 out = remove_first_last_pages(pdf_path, remove_first, remove_last)
+                 download_bytes("📥 Download Modified PDF", out.getvalue(), "modified.pdf", "application/pdf")
+
+         elif tool == "Keyword Search & Highlight":
+             keyword = st.text_input("Keyword to highlight", "")
+             if st.button("Search & Highlight") and keyword.strip():
+                 out_path = keyword_highlight_pdf(pdf_path, keyword.strip(), out_file("highlighted.pdf"))
+                 with open(out_path, "rb") as f:
+                     download_bytes("📥 Download Highlighted PDF", f.read(), "highlighted.pdf", "application/pdf")
+
+         elif tool == "Extract Images":
+             folder = extract_images(pdf_path, output_folder="images_out")
+             # st.success(f"Extracted images → {folder}")
+             if isinstance(folder, list):
+                 folder = folder[0]
+
+             if os.path.isdir(folder) and len(os.listdir(folder)) > 0:
+                 zbytes = zip_folder_to_bytes(folder)
+                 download_bytes("📥 Download Images (ZIP)", zbytes, "images.zip", "application/zip")
+                 image_files = [os.path.join(folder, f) for f in sorted(os.listdir(folder))]
+                 # st.write("### Extracted Images Preview")
+                 cols = st.columns(3)  # grid with 3 columns
+                 for i, img in enumerate(image_files):
+                     with cols[i % 3]:
+                         st.image(img, caption=os.path.basename(img), use_container_width=True)
+
+         elif tool == "Extract Tables":
+             tables = extract_tables(pdf_path)
+             if not tables:
+                 st.info("No tables detected.")
+             else:
+                 try:
+                     import pandas as pd
+                 except ImportError:
+                     st.warning("Install pandas to view tables nicely.")
+                     st.write(tables)
+                 else:
+                     for i, t in enumerate(tables):
+                         if hasattr(t, "to_csv"):  # Camelot DataFrame
+                             df = t
+                         else:  # pdfplumber list-of-rows
+                             df = pd.DataFrame(t)
+                         st.subheader(f"Table {i+1}")
+                         st.dataframe(df)
+
+         elif tool == "OCR Scanned PDF":
+             # st.info("Requires Tesseract installed on your system.")
+             lang = st.selectbox(
+                 "Select OCR language",
+                 ["eng", "hin", "fra", "deu", "jpn", "kor"],  # Add more as needed
+                 index=0
+             )
+             if st.button("Run OCR"):
+                 text = ocr_pdf(pdf_path, lang)
+                 st.text_area("OCR Output", text, height=300)
+
+         elif tool == "Reorder Pages":
+             st.caption("Enter comma-separated 0-indexed page order. Example for 3 pages: 2,0,1")
+             order_str = st.text_input("New order", "")
+             if st.button("Reorder") and order_str.strip():
+                 new_order = [int(x.strip()) for x in order_str.split(",") if x.strip().isdigit()]
+                 out_path = reorder_pages(pdf_path, new_order, out_file("reordered.pdf"))
+                 with open(out_path, "rb") as f:
+                     download_bytes("📥 Download Reordered PDF", f.read(), "reordered.pdf", "application/pdf")
+
+         elif tool == "Rotate Pages":
+             st.caption("Enter 0-indexed pages, comma-separated. Angle typically 90/180/270.")
+             pages_str = st.text_input("Pages to rotate", "")
+             angle = st.number_input("Angle", min_value=0, max_value=360, value=90, step=90)
+             if st.button("Rotate") and pages_str.strip():
+                 pages = [int(x.strip()) for x in pages_str.split(",") if x.strip().isdigit()]
+                 out_path = rotate_pages(pdf_path, pages, int(angle), out_file("rotated.pdf"))
+                 with open(out_path, "rb") as f:
+                     download_bytes("📥 Download Rotated PDF", f.read(), "rotated.pdf", "application/pdf")
+
+         elif tool == "Add Watermark":
+             wm = st.text_input("Watermark text", "CONFIDENTIAL")
+             if st.button("Apply Watermark"):
+                 out_path = add_watermark(pdf_path, wm, out_file("watermarked.pdf"))
+                 with open(out_path, "rb") as f:
+                     download_bytes("📥 Download Watermarked PDF", f.read(), "watermarked.pdf", "application/pdf")
+
+         elif tool == "Extract Metadata":
+             meta = extract_metadata(pdf_path)
+             st.json(meta)
+
+         elif tool == "Summarize PDF":
+             text = extract_text_from_pdf(pdf_path)
+             # st.info("Using Together.ai LLaMA for summarization. Set TOGETHER_API_KEY in your environment.")
+             if st.button("Summarize"):
+                 with st.spinner("Summarizing... Please wait ⏳"):
+                     summary = summarize_text(text)
+                 st.write(summary)
+
+         elif tool == "Ask Questions on PDF (RAG)":
+             # st.info("Uses FAISS + MiniLM embeddings + Together.ai LLaMA. Set TOGETHER_API_KEY in your environment.")
+             question = st.text_input("Your question")
+             if st.button("Ask") and question.strip():
+                 text = extract_text_from_pdf(pdf_path)
+                 with st.spinner("Analyzing... Please wait ⏳"):
+                     answer, sources = rag_qa(text, question)
+                 st.subheader("Answer")
+                 st.write(answer)
+                 if sources:
+                     st.subheader("Top source chunks")
+                     for i, s in enumerate(sources, start=1):
+                         st.markdown(f"**Source {i}:**\n\n{getattr(s, 'page_content', '')[:800]}")
+
+         elif tool == "Export to Word (.docx)":
+             out = export_to_word(pdf_path, out_file("export.docx"))
+             with open(out, "rb") as f:
+                 download_bytes("📥 Download DOCX", f.read(), "export.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+
+         elif tool == "Export to Text (.txt)":
+             out = export_to_text(pdf_path, out_file("export.txt"))
+             with open(out, "rb") as f:
+                 download_bytes("📥 Download TXT", f.read(), "export.txt", "text/plain")
+
+         elif tool == "Export to Markdown (.md)":
+             text = extract_text_from_pdf(pdf_path)
+             out = export_text_to_markdown(text, out_file("export.md"))
+             with open(out, "rb") as f:
+                 download_bytes("📥 Download MD", f.read(), "export.md", "text/markdown")
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ streamlit
+ pymupdf==1.24.9
+ PyPDF2
+ pdfplumber
+ pytesseract
+ pdf2image
+ Pillow
+ faiss-cpu
+ sentence-transformers
+ langchain>=0.2.0
+ langchain-community>=0.2.0
+ huggingface_hub
+ together
+ python-docx
+ pypandoc
+ numpy
+ pandas
+ pdf2docx          # used by utils/pdf_export.py
+ camelot-py[cv]    # used by utils/pdf_processing.py
+ python-dotenv     # used by utils/pdf_analysis.py
+ openai            # required by the ChatOpenAI wrapper in utils/pdf_analysis.py
+ # System-level dependencies (installed via apt in the Dockerfile, not pip-installable):
+ # tesseract-ocr
+ # tesseract-ocr-eng
+ # poppler-utils
+ # ghostscript
+ # libgl1
+ # pandoc
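As a sanity check, a small import smoke test can confirm that the Python-level packages resolve inside the built image; the module names below follow the imports used in `main.py` and `utils/` and are the only assumption:

```python
# Sketch: import smoke test for the pip-installed dependencies (run inside the image).
modules = [
    "streamlit", "fitz", "PyPDF2", "pdfplumber", "pytesseract", "pdf2image",
    "PIL", "faiss", "sentence_transformers", "langchain", "langchain_community",
    "docx", "pypandoc", "pandas", "numpy", "camelot", "pdf2docx", "dotenv",
]
for name in modules:
    __import__(name)  # raises ImportError if the corresponding requirement is missing
print("all imports resolved")
```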
utils/pdf_analysis.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ from langchain_community.vectorstores import FAISS  # community import paths for langchain >= 0.2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains import RetrievalQA
+ from langchain_community.chat_models import ChatOpenAI
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Build FAISS retriever from raw text
+ def build_retriever_from_text(text: str, chunk_size: int = 800, overlap: int = 100, k: int = 3):
+     splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+     chunks = splitter.split_text(text)
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     vs = FAISS.from_texts(chunks, embeddings)
+     return vs.as_retriever(search_kwargs={"k": k})
+
+
+ # Together.ai LLM (LLaMA) factory
+ def together_llm(model: str = "meta-llama/Llama-Vision-Free", temperature: float = 0.2, max_tokens: int = 512):
+     return ChatOpenAI(
+         model=model,
+         temperature=temperature,
+         max_tokens=max_tokens,
+         openai_api_key=os.getenv("TOGETHER_API_KEY"),
+         openai_api_base="https://api.together.xyz/v1"
+     )
+
+
+ # Q&A over PDF (RAG)
+ def rag_qa(text: str, question: str, model: str = "meta-llama/Llama-Vision-Free"):
+     retriever = build_retriever_from_text(text)
+     llm = together_llm(model=model)
+     qa = RetrievalQA.from_chain_type(
+         llm=llm,
+         retriever=retriever,
+         return_source_documents=True,
+         chain_type="stuff"
+     )
+     result = qa({"query": question})
+     return result["result"], result.get("source_documents", [])
+
+
+ # Summarize PDF text
+ def summarize_text(text: str, model: str = "meta-llama/Llama-Vision-Free"):
+     prompt = (
+         "You are a concise technical summarizer. Summarize the following document in 6-10 bullet points, "
+         "preserving key facts, numbers, and definitions. Text:\n\n"
+         f"{text}"
+     )
+     llm = together_llm(model=model, temperature=0.2, max_tokens=400)
+     output = llm.invoke(prompt)
+     return output.content.strip()
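For reference, a minimal sketch of driving these helpers outside the Streamlit UI; it assumes `TOGETHER_API_KEY` is set and that a `sample.pdf` exists (both illustrative):

```python
# Sketch: using the analysis helpers directly (path and question are illustrative).
from utils.pdf_processing import extract_text_from_pdf
from utils.pdf_analysis import summarize_text, rag_qa

text = extract_text_from_pdf("sample.pdf")   # selectable text only; use ocr_pdf for scans
print(summarize_text(text))                  # bullet-point summary via Together.ai

answer, sources = rag_qa(text, "What is the main conclusion of this document?")
print(answer)
print(f"retrieved {len(sources)} source chunks")
```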
utils/pdf_export.py ADDED
@@ -0,0 +1,28 @@
+ import fitz
+ from pdf2docx import Converter
+ import pypandoc
+
+
+ def export_to_word(pdf_path: str, output_path: str = "output.docx") -> str:
+     """Convert PDF to DOCX (layout-aware)."""
+     cv = Converter(pdf_path)
+     cv.convert(output_path, start=0, end=None)
+     cv.close()
+     return output_path
+
+
+ def export_to_text(pdf_path: str, output_path: str = "output.txt") -> str:
+     """Export selectable text to TXT."""
+     text = ""
+     with fitz.open(pdf_path) as doc:
+         for page in doc:
+             text += page.get_text()
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write(text)
+     return output_path
+
+
+ def export_text_to_markdown(text: str, output_path: str = "output.md") -> str:
+     """Export text (already extracted) to Markdown."""
+     pypandoc.convert_text(text, "md", format="md", outputfile=output_path, extra_args=["--standalone"])
+     return output_path
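A short usage sketch for the export helpers (file names are illustrative; `export_to_word` needs `pdf2docx`, and `export_text_to_markdown` additionally needs the pandoc binary for `pypandoc`):

```python
# Sketch: converting a PDF with the export helpers (file names are illustrative).
from utils.pdf_export import export_to_word, export_to_text

docx_path = export_to_word("sample.pdf", "sample.docx")  # layout-aware conversion via pdf2docx
txt_path = export_to_text("sample.pdf", "sample.txt")    # plain selectable text via PyMuPDF
print(docx_path, txt_path)
```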
utils/pdf_processing.py ADDED
@@ -0,0 +1,195 @@
+ import os
+ import io
+ import zipfile
+ import fitz  # PyMuPDF
+ from PIL import Image
+ import pytesseract
+ import pdfplumber
+ import camelot
+ from PyPDF2 import PdfReader, PdfWriter
+
+
+ # -------------------------
+ # BASIC PDF TOOLS
+ # -------------------------
+
+ def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
+     """
+     Split selected pages into separate PDFs and return a ZIP (in-memory).
+     start_page/end_page are 1-indexed (inclusive).
+     """
+     reader = PdfReader(pdf_path)
+     zip_buffer = io.BytesIO()
+     with zipfile.ZipFile(zip_buffer, "w") as zipf:
+         for i in range(start_page, end_page + 1):
+             writer = PdfWriter()
+             writer.add_page(reader.pages[i - 1])
+             pdf_bytes = io.BytesIO()
+             writer.write(pdf_bytes)
+             pdf_bytes.seek(0)
+             zipf.writestr(f"page_{i}.pdf", pdf_bytes.read())
+     zip_buffer.seek(0)
+     return zip_buffer
+
+
+ def merge_pdfs(files_or_paths) -> io.BytesIO:
+     """
+     Merge multiple PDFs. Accepts a list of file-like objects or file paths.
+     Returns merged PDF as BytesIO.
+     """
+     writer = PdfWriter()
+     for f in files_or_paths:
+         reader = PdfReader(f) if hasattr(f, "read") else PdfReader(str(f))
+         for page in reader.pages:
+             writer.add_page(page)
+     out = io.BytesIO()
+     writer.write(out)
+     out.seek(0)
+     return out
+
+
+ def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
+     """Extract a page range (1-indexed, inclusive) into a single PDF (in-memory)."""
+     reader = PdfReader(pdf_path)
+     writer = PdfWriter()
+     for i in range(start_page, end_page + 1):
+         writer.add_page(reader.pages[i - 1])
+     out = io.BytesIO()
+     writer.write(out)
+     out.seek(0)
+     return out
+
+
+ def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
+     """Remove first and/or last page and return modified PDF (in-memory)."""
+     reader = PdfReader(pdf_path)
+     writer = PdfWriter()
+     total = len(reader.pages)
+     for i in range(total):
+         if (remove_first and i == 0) or (remove_last and i == total - 1):
+             continue
+         writer.add_page(reader.pages[i])
+     out = io.BytesIO()
+     writer.write(out)
+     out.seek(0)
+     return out
+
+
+ # -------------------------
+ # ADVANCED UTILITIES
+ # -------------------------
+
+ def extract_text_from_pdf(pdf_path: str) -> str:
+     """Extract selectable text (not OCR) via PyMuPDF."""
+     text = ""
+     with fitz.open(pdf_path) as doc:
+         for page in doc:
+             text += page.get_text()
+     return text
+
+
+ def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
+     """Search keyword and highlight occurrences in the PDF (case-insensitive)."""
+     doc = fitz.open(pdf_path)
+     for page in doc:
+         matches = page.search_for(keyword, quads=False)
+         for rect in matches:
+             page.add_highlight_annot(rect)
+     doc.save(output_path, garbage=4, deflate=True)
+     return output_path
+
+
+ def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> str:
+     """Extract embedded images into a folder; returns the output folder path."""
+     os.makedirs(output_folder, exist_ok=True)
+     doc = fitz.open(pdf_path)
+
+     for page_num in range(len(doc)):
+         page = doc[page_num]
+         images = page.get_images(full=True)
+         for img_index, img in enumerate(images):
+             xref = img[0]
+             base_image = doc.extract_image(xref)
+             image_bytes = base_image["image"]
+             image_ext = base_image["ext"]
+             image_filename = os.path.join(
+                 output_folder,
+                 f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
+             )
+             with open(image_filename, "wb") as f:
+                 f.write(image_bytes)
+
+     return output_folder
+
+
+ def extract_tables(pdf_path: str):
+     """
+     Try Camelot first; fall back to pdfplumber.
+     Returns list of DataFrames (Camelot) or list-of-rows tables (pdfplumber).
+     """
+     try:
+         tables = camelot.read_pdf(pdf_path, pages="all")
+         if tables.n > 0:
+             return [t.df for t in tables]
+     except Exception:
+         pass
+
+     results = []
+     with pdfplumber.open(pdf_path) as pdf:
+         for page in pdf.pages:
+             page_tables = page.extract_tables()
+             results.extend(page_tables or [])
+     return results
+
+
+ def ocr_pdf(pdf_path: str, lang: str = "eng") -> str:
+     """OCR image-only pages via Tesseract and PyMuPDF rasterization."""
+     text = ""
+     with fitz.open(pdf_path) as doc:
+         for page in doc:
+             pix = page.get_pixmap()
+             img = Image.open(io.BytesIO(pix.tobytes("png")))
+             custom_config = r'--oem 3 --psm 6'
+             text += pytesseract.image_to_string(img, lang=lang, config=custom_config) + "\n"
+     return text
+
+
+ def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
+     """Reorder pages by 0-indexed positions. Saves to output_path."""
+     src = fitz.open(pdf_path)
+     dst = fitz.open()
+     for i in new_order:
+         dst.insert_pdf(src, from_page=i, to_page=i)
+     dst.save(output_path)
+     return output_path
+
+
+ def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
+     """Rotate selected 0-indexed pages by angle (e.g., 90/180/270)."""
+     doc = fitz.open(pdf_path)
+     for p in pages_to_rotate:
+         doc[p].set_rotation(angle)
+     doc.save(output_path)
+     return output_path
+
+
+ def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
+     """Add a gray text watermark to every page."""
+     doc = fitz.open(pdf_path)
+     for page in doc:
+         rect = page.rect
+         page.insert_text(
+             (rect.width * 0.25, rect.height * 0.5),
+             watermark_text,
+             fontsize=30,
+             rotate=0,
+             color=(0.59, 0.59, 0.59)
+         )
+     doc.save(output_path)
+     return output_path
+
+
+ def extract_metadata(pdf_path: str) -> dict:
+     """Return PDF metadata dictionary."""
+     with fitz.open(pdf_path) as doc:
+         return doc.metadata or {}
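Finally, a sketch exercising a few of the processing helpers directly (paths are illustrative; OCR additionally requires the tesseract binary on the PATH):

```python
# Sketch: driving the processing helpers without the UI (paths are illustrative).
from utils.pdf_processing import merge_pdfs, extract_metadata, split_pdf_pages

merged = merge_pdfs(["a.pdf", "b.pdf"])        # BytesIO holding the combined document
with open("merged.pdf", "wb") as f:
    f.write(merged.getvalue())

print(extract_metadata("merged.pdf"))          # metadata dict from PyMuPDF

zip_buf = split_pdf_pages("merged.pdf", 1, 2)  # pages 1-2 as one-page PDFs in a ZIP
with open("pages.zip", "wb") as f:
    f.write(zip_buf.getvalue())
```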