Commit · 1e9ae83 · 1 parent: bbddb9f
first commit

Files changed:
- .python-version +1 -0
- Dockerfile +22 -0
- README.md +14 -7
- main.py +255 -0
- requirements.txt +23 -0
- utils/pdf_analysis.py +54 -0
- utils/pdf_export.py +28 -0
- utils/pdf_processing.py +195 -0

.python-version
ADDED
@@ -0,0 +1 @@
3.13

Dockerfile
ADDED
@@ -0,0 +1,22 @@
FROM python:3.10-slim

# Install system dependencies (Tesseract for OCR, Poppler for pdf2image,
# Ghostscript for Camelot, Pandoc for pypandoc, libgl1 for OpenCV)
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    poppler-utils \
    ghostscript \
    pandoc \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Set work directory
WORKDIR /app

# Copy requirements
COPY requirements.txt .

# Install python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY . .

# Run Streamlit app (the entry point is main.py, per the README's app_file)
CMD ["streamlit", "run", "main.py", "--server.port", "7860", "--server.address", "0.0.0.0"]

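A quick way to confirm the system dependencies above are visible at runtime (a sketch, not part of the commit; pytesseract shells out to tesseract, pdf2image to Poppler's pdftoppm, Camelot to Ghostscript, pypandoc to pandoc):

import shutil

# Each tool below is an external binary the Python packages invoke.
for tool in ["tesseract", "pdftoppm", "gs", "pandoc"]:
    print(f"{tool}: {shutil.which(tool) or 'MISSING'}")
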
README.md
CHANGED
@@ -1,11 +1,18 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk:
+title: PDF AI Toolkit (Streamlit + FAISS RAG)
+emoji: 🧰
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+app_file: main.py
+python_version: "3.10"
 pinned: false
-license: mit
 ---
 
-
+# PDF AI Toolkit
+
+A Streamlit app for PDF processing (split/merge/extract/rotate/watermark/metadata), OCR (Tesseract), table & image extraction, and analysis (FAISS RAG + Together.ai Llama).
+See the sidebar to pick **PDF Processing**, **Advanced Processing**, **Analysis**, or **Export** tools.
+
+## Secrets
+Add `TOGETHER_API_KEY` in your Space → **Settings → Secrets**.

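For context, the app consumes that secret through an OpenAI-compatible client pointed at Together.ai; a minimal sketch mirroring what utils/pdf_analysis.py does (the model name is the app's default):

import os
from langchain_community.chat_models import ChatOpenAI

# Together.ai exposes an OpenAI-compatible endpoint, so ChatOpenAI works
# once it is pointed at Together's base URL with the Space secret as key.
llm = ChatOpenAI(
    model="meta-llama/Llama-Vision-Free",
    openai_api_key=os.getenv("TOGETHER_API_KEY"),
    openai_api_base="https://api.together.xyz/v1",
)
print(llm.invoke("Reply with OK if you can read this.").content)
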
main.py
ADDED
@@ -0,0 +1,255 @@
import os
import tempfile
import zipfile
import io
import streamlit as st
from utils.pdf_processing import (
    split_pdf_pages, merge_pdfs, extract_page_range, remove_first_last_pages,
    extract_text_from_pdf, keyword_highlight_pdf, extract_images, extract_tables,
    ocr_pdf, reorder_pages, rotate_pages, add_watermark, extract_metadata
)
from utils.pdf_analysis import rag_qa, summarize_text
from utils.pdf_export import export_to_word, export_to_text, export_text_to_markdown


st.set_page_config(page_title="PDF Toolkit", layout="wide")
st.title("🛠 PDF Toolkit")

with st.sidebar:
    st.header("Tools")
    # Define tool categories
    tool_categories = {
        "PDF Processing": [
            "Split PDF Pages",
            "Merge PDFs",
            "Extract Page Range",
            "Remove First/Last Pages",
        ],
        "Advanced Processing": [
            "Keyword Search & Highlight",
            "Extract Images",
            "Extract Tables",
            "OCR Scanned PDF",
            "Reorder Pages",
            "Rotate Pages",
            "Add Watermark",
            "Extract Metadata",
        ],
        "Analysis": [
            "Summarize PDF",
            "Ask Questions on PDF (RAG)",
        ],
        "Export": [
            "Export to Word (.docx)",
            "Export to Text (.txt)",
            "Export to Markdown (.md)",
        ],
    }

    # Step 1: User selects category
    selected_category = st.selectbox(
        "Choose a Category",
        list(tool_categories.keys())
    )

    # Step 2: Show tools under that category
    tool = st.selectbox(
        "Choose a Tool",
        tool_categories[selected_category]
    )
    # st.caption("Note: For OCR, ensure Tesseract is installed on system path.")


# ------------- Helpers for downloads -------------
def download_bytes(label: str, data: bytes, file_name: str, mime: str):
    st.download_button(label, data, file_name=file_name, mime=mime)


def zip_folder_to_bytes(folder_path: str) -> bytes:
    mem = io.BytesIO()
    with zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(folder_path):
            for f in files:
                full = os.path.join(root, f)
                arc = os.path.relpath(full, start=folder_path)
                zf.write(full, arcname=arc)
    mem.seek(0)
    return mem.read()


OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def out_file(name: str) -> str:
    return os.path.join(OUTPUT_DIR, name)

# ------------- UI Logic -------------

# Most tools need a single PDF file, except "Merge PDFs"
if tool == "Merge PDFs":
    uploaded_files = st.file_uploader("Upload PDFs to merge", type=["pdf"], accept_multiple_files=True)
    if uploaded_files:
        if st.button("Merge"):
            merged = merge_pdfs(uploaded_files)  # returns BytesIO
            download_bytes("📥 Download Merged PDF", merged.getvalue(), "merged.pdf", "application/pdf")
else:
    uploaded = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded.read())
            pdf_path = tmp.name

        if tool == "Split PDF Pages":
            # Read total pages for UI
            from PyPDF2 import PdfReader as _Reader
            total_pages = len(_Reader(pdf_path).pages)
            option = st.radio("Select Option", ["All Pages", "Page Range", "Single Page"], horizontal=True)

            if option == "All Pages":
                start_page, end_page = 1, total_pages
            elif option == "Page Range":
                start_page = st.number_input("Start Page", 1, total_pages, 1)
                end_page = st.number_input("End Page", start_page, total_pages, total_pages)
            else:
                start_page = st.number_input("Page Number", 1, total_pages, 1)
                end_page = start_page

            if st.button("Split & Download ZIP"):
                zip_bytesio = split_pdf_pages(pdf_path, int(start_page), int(end_page))
                download_bytes("📥 Download ZIP", zip_bytesio.getvalue(), "split_pages.zip", "application/zip")

        elif tool == "Extract Page Range":
            from PyPDF2 import PdfReader as _Reader
            total_pages = len(_Reader(pdf_path).pages)
            start_page = st.number_input("Start Page", 1, total_pages, 1)
            end_page = st.number_input("End Page", start_page, total_pages, total_pages)
            if st.button("Extract Range"):
                out = extract_page_range(pdf_path, int(start_page), int(end_page))
                download_bytes("📥 Download Extracted PDF", out.getvalue(), "extracted_range.pdf", "application/pdf")

        elif tool == "Remove First/Last Pages":
            remove_first = st.checkbox("Remove First Page", value=True)
            remove_last = st.checkbox("Remove Last Page", value=False)
            if st.button("Remove & Download"):
                out = remove_first_last_pages(pdf_path, remove_first, remove_last)
                download_bytes("📥 Download Modified PDF", out.getvalue(), "modified.pdf", "application/pdf")

        elif tool == "Keyword Search & Highlight":
            keyword = st.text_input("Keyword to highlight", "")
            if st.button("Search & Highlight") and keyword.strip():
                out_path = keyword_highlight_pdf(pdf_path, keyword.strip(), out_file("highlighted.pdf"))
                with open(out_path, "rb") as f:
                    download_bytes("📥 Download Highlighted PDF", f.read(), "highlighted.pdf", "application/pdf")

        elif tool == "Extract Images":
            folder = extract_images(pdf_path, output_folder="images_out")
            # st.success(f"Extracted images → {folder}")
            if isinstance(folder, list):
                folder = folder[0]

            if os.path.isdir(folder) and len(os.listdir(folder)) > 0:
                zbytes = zip_folder_to_bytes(folder)
                download_bytes("📥 Download Images (ZIP)", zbytes, "images.zip", "application/zip")
                image_files = [os.path.join(folder, f) for f in sorted(os.listdir(folder))]
                # st.write("### Extracted Images Preview")
                cols = st.columns(3)  # grid with 3 columns
                for i, img in enumerate(image_files):
                    with cols[i % 3]:
                        st.image(img, caption=os.path.basename(img), use_container_width=True)

        elif tool == "Extract Tables":
            tables = extract_tables(pdf_path)
            if not tables:
                st.info("No tables detected.")
            else:
                try:
                    import pandas as pd
                except ImportError:
                    st.warning("Install pandas to view tables nicely.")
                    st.write(tables)
                else:
                    for i, t in enumerate(tables):
                        if hasattr(t, "to_csv"):  # Camelot DataFrame
                            df = t
                        else:  # pdfplumber list-of-rows
                            df = pd.DataFrame(t)
                        st.subheader(f"Table {i+1}")
                        st.dataframe(df)

        elif tool == "OCR Scanned PDF":
            # st.info("Requires Tesseract installed on your system.")
            lang = st.selectbox(
                "Select OCR language",
                ["eng", "hin", "fra", "deu", "jpn", "kor"],  # Add more as needed
                index=0
            )
            if st.button("Run OCR"):
                text = ocr_pdf(pdf_path, lang)
                st.text_area("OCR Output", text, height=300)

        elif tool == "Reorder Pages":
            st.caption("Enter comma-separated 0-indexed page order. Example for 3 pages: 2,0,1")
            order_str = st.text_input("New order", "")
            if st.button("Reorder") and order_str.strip():
                new_order = [int(x.strip()) for x in order_str.split(",") if x.strip().isdigit()]
                out_path = reorder_pages(pdf_path, new_order, out_file("reordered.pdf"))
                with open(out_path, "rb") as f:
                    download_bytes("📥 Download Reordered PDF", f.read(), "reordered.pdf", "application/pdf")

        elif tool == "Rotate Pages":
            st.caption("Enter 0-indexed pages, comma-separated. Angle typically 90/180/270.")
            pages_str = st.text_input("Pages to rotate", "")
            angle = st.number_input("Angle", min_value=0, max_value=360, value=90, step=90)
            if st.button("Rotate") and pages_str.strip():
                pages = [int(x.strip()) for x in pages_str.split(",") if x.strip().isdigit()]
                out_path = rotate_pages(pdf_path, pages, int(angle), out_file("rotated.pdf"))
                with open(out_path, "rb") as f:
                    download_bytes("📥 Download Rotated PDF", f.read(), "rotated.pdf", "application/pdf")

        elif tool == "Add Watermark":
            wm = st.text_input("Watermark text", "CONFIDENTIAL")
            if st.button("Apply Watermark"):
                out_path = add_watermark(pdf_path, wm, out_file("watermarked.pdf"))
                with open(out_path, "rb") as f:
                    download_bytes("📥 Download Watermarked PDF", f.read(), "watermarked.pdf", "application/pdf")

        elif tool == "Extract Metadata":
            meta = extract_metadata(pdf_path)
            st.json(meta)

        elif tool == "Summarize PDF":
            text = extract_text_from_pdf(pdf_path)
            # st.info("Using Together.ai LLaMA for summarization. Set TOGETHER_API_KEY in your environment.")
            if st.button("Summarize"):
                with st.spinner("Summarizing... Please wait ⏳"):
                    summary = summarize_text(text)
                st.write(summary)

        elif tool == "Ask Questions on PDF (RAG)":
            # st.info("Uses FAISS + MiniLM embeddings + Together.ai LLaMA. Set TOGETHER_API_KEY in your environment.")
            question = st.text_input("Your question")
            if st.button("Ask") and question.strip():
                text = extract_text_from_pdf(pdf_path)
                with st.spinner("Analyzing... Please wait ⏳"):
                    answer, sources = rag_qa(text, question)
                st.subheader("Answer")
                st.write(answer)
                if sources:
                    st.subheader("Top source chunks")
                    for i, s in enumerate(sources, start=1):
                        st.markdown(f"**Source {i}:**\n\n{getattr(s, 'page_content', '')[:800]}")

        elif tool == "Export to Word (.docx)":
            out = export_to_word(pdf_path, out_file("export.docx"))
            with open(out, "rb") as f:
                download_bytes("📥 Download DOCX", f.read(), "export.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")

        elif tool == "Export to Text (.txt)":
            out = export_to_text(pdf_path, out_file("export.txt"))
            with open(out, "rb") as f:
                download_bytes("📥 Download TXT", f.read(), "export.txt", "text/plain")

        elif tool == "Export to Markdown (.md)":
            text = extract_text_from_pdf(pdf_path)
            out = export_text_to_markdown(text, out_file("export.md"))
            with open(out, "rb") as f:
                download_bytes("📥 Download MD", f.read(), "export.md", "text/markdown")

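The zip_folder_to_bytes helper above is the piece most worth a standalone check; a minimal round-trip sketch (logic duplicated here, since importing main.py would launch the Streamlit app):

import io, os, tempfile, zipfile

with tempfile.TemporaryDirectory() as d:
    # Write a stub file, zip the folder the same way the helper does,
    # and confirm it comes back out of the archive.
    with open(os.path.join(d, "page_1.pdf"), "wb") as f:
        f.write(b"%PDF-1.4 stub")
    mem = io.BytesIO()
    with zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(d):
            for name in files:
                full = os.path.join(root, name)
                zf.write(full, arcname=os.path.relpath(full, start=d))
    mem.seek(0)
    print(zipfile.ZipFile(mem).namelist())  # ['page_1.pdf']
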
requirements.txt
ADDED
@@ -0,0 +1,23 @@
streamlit
pymupdf==1.24.9
PyPDF2
pdfplumber
pytesseract
pdf2image
Pillow
faiss-cpu
sentence-transformers
langchain>=0.2.0
langchain-community>=0.2.0
huggingface_hub
together
python-docx
pypandoc
numpy
pandas
pdf2docx          # used by utils/pdf_export.py
camelot-py[cv]    # used by utils/pdf_processing.py (table extraction)
python-dotenv     # used by utils/pdf_analysis.py
# Note: tesseract-ocr, poppler-utils, ghostscript, pandoc, and libgl1 are
# system packages installed by the Dockerfile, not pip-installable modules.

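A quick sanity check that every pip dependency above resolves to an importable module (a sketch; several distribution names differ from their import names, e.g. pymupdf → fitz, Pillow → PIL, camelot-py → camelot):

import importlib

# Import name for each pip package in requirements.txt.
for mod in ["streamlit", "fitz", "PyPDF2", "pdfplumber", "pytesseract",
            "pdf2image", "PIL", "faiss", "sentence_transformers",
            "langchain", "langchain_community", "huggingface_hub",
            "together", "docx", "pypandoc", "numpy", "pandas",
            "pdf2docx", "camelot", "dotenv"]:
    importlib.import_module(mod)
print("all imports OK")
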
utils/pdf_analysis.py
ADDED
@@ -0,0 +1,54 @@
import os
# langchain>=0.2 moved these integrations into langchain_community
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOpenAI

from dotenv import load_dotenv
load_dotenv()

# Build FAISS retriever from raw text
def build_retriever_from_text(text: str, chunk_size: int = 800, overlap: int = 100, k: int = 3):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vs = FAISS.from_texts(chunks, embeddings)
    return vs.as_retriever(search_kwargs={"k": k})


# Together.ai LLM (LLaMA) factory: Together exposes an OpenAI-compatible API,
# so ChatOpenAI works when pointed at its base URL.
def together_llm(model: str = "meta-llama/Llama-Vision-Free", temperature: float = 0.2, max_tokens: int = 512):
    return ChatOpenAI(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        openai_api_key=os.getenv("TOGETHER_API_KEY"),
        openai_api_base="https://api.together.xyz/v1"
    )


# Q&A over PDF (RAG)
def rag_qa(text: str, question: str, model: str = "meta-llama/Llama-Vision-Free"):
    retriever = build_retriever_from_text(text)
    llm = together_llm(model=model)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff"
    )
    result = qa.invoke({"query": question})
    return result["result"], result.get("source_documents", [])


# Summarize PDF text
def summarize_text(text: str, model: str = "meta-llama/Llama-Vision-Free"):
    prompt = (
        "You are a concise technical summarizer. Summarize the following document in 6-10 bullet points, "
        "preserving key facts, numbers, and definitions. Text:\n\n"
        f"{text}"
    )
    llm = together_llm(model=model, temperature=0.2, max_tokens=400)
    output = llm.invoke(prompt)
    return output.content.strip()

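A minimal usage sketch of the two entry points above (assumes TOGETHER_API_KEY is set; "sample.pdf" is a placeholder path):

from utils.pdf_processing import extract_text_from_pdf
from utils.pdf_analysis import rag_qa, summarize_text

text = extract_text_from_pdf("sample.pdf")
print(summarize_text(text))                 # bullet-point summary via Together.ai

answer, sources = rag_qa(text, "What is this document about?")
print(answer)
print(f"retrieved {len(sources)} source chunks")
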
utils/pdf_export.py
ADDED
@@ -0,0 +1,28 @@
import fitz  # PyMuPDF
from pdf2docx import Converter
import pypandoc


def export_to_word(pdf_path: str, output_path: str = "output.docx") -> str:
    """Convert PDF to DOCX (layout-aware)."""
    cv = Converter(pdf_path)
    cv.convert(output_path, start=0, end=None)
    cv.close()
    return output_path


def export_to_text(pdf_path: str, output_path: str = "output.txt") -> str:
    """Export selectable text to TXT."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return output_path


def export_text_to_markdown(text: str, output_path: str = "output.md") -> str:
    """Export text (already extracted) to Markdown. Requires the pandoc binary."""
    pypandoc.convert_text(text, "md", format="md", outputfile=output_path, extra_args=["--standalone"])
    return output_path

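A minimal usage sketch of the exporters above ("sample.pdf" is a placeholder path):

from utils.pdf_export import export_to_word, export_to_text, export_text_to_markdown

export_to_word("sample.pdf", "out.docx")         # layout-aware DOCX via pdf2docx
txt = export_to_text("sample.pdf", "out.txt")    # plain text via PyMuPDF
with open(txt, encoding="utf-8") as f:
    export_text_to_markdown(f.read(), "out.md")  # needs pandoc on PATH
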
utils/pdf_processing.py
ADDED
@@ -0,0 +1,195 @@
import os
import io
import zipfile
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import pdfplumber
import camelot
from PyPDF2 import PdfReader, PdfWriter


# -------------------------
# BASIC PDF TOOLS
# -------------------------

def split_pdf_pages(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """
    Split selected pages into separate PDFs and return a ZIP (in-memory).
    start_page/end_page are 1-indexed (inclusive).
    """
    reader = PdfReader(pdf_path)
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zipf:
        for i in range(start_page, end_page + 1):
            writer = PdfWriter()
            writer.add_page(reader.pages[i - 1])
            pdf_bytes = io.BytesIO()
            writer.write(pdf_bytes)
            pdf_bytes.seek(0)
            zipf.writestr(f"page_{i}.pdf", pdf_bytes.read())
    zip_buffer.seek(0)
    return zip_buffer


def merge_pdfs(files_or_paths) -> io.BytesIO:
    """
    Merge multiple PDFs. Accepts a list of file-like objects or file paths.
    Returns merged PDF as BytesIO.
    """
    writer = PdfWriter()
    for f in files_or_paths:
        reader = PdfReader(f) if hasattr(f, "read") else PdfReader(str(f))
        for page in reader.pages:
            writer.add_page(page)
    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out


def extract_page_range(pdf_path: str, start_page: int, end_page: int) -> io.BytesIO:
    """Extract a page range (1-indexed, inclusive) into a single PDF (in-memory)."""
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i - 1])
    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out


def remove_first_last_pages(pdf_path: str, remove_first: bool, remove_last: bool) -> io.BytesIO:
    """Remove first and/or last page and return modified PDF (in-memory)."""
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    total = len(reader.pages)
    for i in range(total):
        if (remove_first and i == 0) or (remove_last and i == total - 1):
            continue
        writer.add_page(reader.pages[i])
    out = io.BytesIO()
    writer.write(out)
    out.seek(0)
    return out


# -------------------------
# ADVANCED UTILITIES
# -------------------------

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract selectable text (not OCR) via PyMuPDF."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text


def keyword_highlight_pdf(pdf_path: str, keyword: str, output_path: str = "highlighted.pdf") -> str:
    """Search keyword and highlight occurrences in the PDF (case-insensitive)."""
    doc = fitz.open(pdf_path)
    for page in doc:
        matches = page.search_for(keyword, quads=False)
        for rect in matches:
            page.add_highlight_annot(rect)
    doc.save(output_path, garbage=4, deflate=True)
    return output_path


def extract_images(pdf_path: str, output_folder: str = "extracted_images") -> str:
    """Extract embedded images; returns the folder the images were saved to."""
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)

    for page_num in range(len(doc)):
        page = doc[page_num]
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = os.path.join(
                output_folder,
                f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
            )
            with open(image_filename, "wb") as f:
                f.write(image_bytes)

    return output_folder


def extract_tables(pdf_path: str):
    """
    Try Camelot first; fall back to pdfplumber.
    Returns a list of DataFrames (Camelot) or list-of-rows tables (pdfplumber).
    """
    try:
        tables = camelot.read_pdf(pdf_path, pages="all")
        if tables.n > 0:
            return [t.df for t in tables]
    except Exception:
        pass

    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()
            results.extend(page_tables or [])
    return results


def ocr_pdf(pdf_path: str, lang: str = "eng") -> str:
    """OCR image-only pages via Tesseract and PyMuPDF rasterization."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            custom_config = r'--oem 3 --psm 6'
            text += pytesseract.image_to_string(img, lang=lang, config=custom_config) + "\n"
    return text


def reorder_pages(pdf_path: str, new_order: list[int], output_path: str = "reordered.pdf") -> str:
    """Reorder pages by 0-indexed positions. Saves to output_path."""
    src = fitz.open(pdf_path)
    dst = fitz.open()
    for i in new_order:
        dst.insert_pdf(src, from_page=i, to_page=i)
    dst.save(output_path)
    return output_path


def rotate_pages(pdf_path: str, pages_to_rotate: list[int], angle: int, output_path: str = "rotated.pdf") -> str:
    """Rotate selected 0-indexed pages by angle (e.g., 90/180/270)."""
    doc = fitz.open(pdf_path)
    for p in pages_to_rotate:
        doc[p].set_rotation(angle)
    doc.save(output_path)
    return output_path


def add_watermark(pdf_path: str, watermark_text: str, output_path: str = "watermarked.pdf") -> str:
    """Add a gray text watermark to every page."""
    doc = fitz.open(pdf_path)
    for page in doc:
        rect = page.rect
        page.insert_text(
            (rect.width * 0.25, rect.height * 0.5),
            watermark_text,
            fontsize=30,
            rotate=0,
            color=(0.59, 0.59, 0.59)
        )
    doc.save(output_path)
    return output_path


def extract_metadata(pdf_path: str) -> dict:
    """Return PDF metadata dictionary."""
    with fitz.open(pdf_path) as doc:
        return doc.metadata or {}

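A minimal usage sketch of a few helpers above ("sample.pdf" is a placeholder path). Note the mixed conventions documented in the docstrings: the split/extract helpers take 1-indexed pages, while reorder/rotate take 0-indexed pages:

from utils.pdf_processing import split_pdf_pages, extract_metadata, rotate_pages, add_watermark

print(extract_metadata("sample.pdf"))
with open("pages.zip", "wb") as f:
    f.write(split_pdf_pages("sample.pdf", 1, 2).getvalue())   # 1-indexed, inclusive
rotate_pages("sample.pdf", [0], 90, "rotated.pdf")            # 0-indexed pages here
add_watermark("sample.pdf", "CONFIDENTIAL", "watermarked.pdf")
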