# app.py - Intelligent PDF Splitter (Dash app); revision ff01f81 ("Update app.py") by bluenevus.
import dash
import dash_bootstrap_components as dbc
from dash import html, dcc, Input, Output, State, ctx, ALL
import flask
import uuid
import os
import tempfile
import shutil
import logging
from flask import send_file
import threading
from PyPDF2 import PdfReader, PdfWriter
import re
import zipfile
import base64
# Root logger configuration: INFO level with timestamp, level name and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
# Per-session server-side state keyed by session id. In this file it is only
# ever popped (see clean_session); nothing visible here writes to it.
SESSION_DATA = {}
# Maps session id -> threading.Lock; filled lazily by get_session_lock and
# emptied per session by clean_session.
SESSION_LOCKS = {}
def get_session_id_from_cookie():
    """Return the ``session-id`` cookie of the current request, or ``None`` if it is absent."""
    return flask.request.cookies.get('session-id')
def get_or_create_session_id():
    """Return the request's ``session-id`` cookie, or a fresh UUID4 string when it is missing or empty.

    A newly generated id is only returned; this function does not store it anywhere.
    """
    cookie_value = flask.request.cookies.get('session-id')
    return cookie_value if cookie_value else str(uuid.uuid4())
def get_session_dir(session_id):
    """Return the per-session working directory, creating it if necessary.

    The directory lives under the system temp directory and is named
    ``dash_pdfsplit_<session_id>``.
    """
    session_path = os.path.join(
        tempfile.gettempdir(),
        'dash_pdfsplit_{}'.format(session_id),
    )
    # exist_ok makes repeated calls for the same session harmless.
    os.makedirs(session_path, exist_ok=True)
    return session_path
def clean_session(session_id):
    """Remove a session's working directory and its in-memory bookkeeping.

    Best-effort: any exception is logged and swallowed so the caller is never
    interrupted by a failed cleanup.
    """
    try:
        target = get_session_dir(session_id)
        if os.path.exists(target):
            shutil.rmtree(target)
        # Drop whatever the registries still hold for this session.
        for registry in (SESSION_DATA, SESSION_LOCKS):
            registry.pop(session_id, None)
        logging.info(f"Cleaned session for {session_id}")
    except Exception as exc:
        logging.error(f"Error cleaning session {session_id}: {exc}")
def get_session_lock(session_id):
    """Return the ``threading.Lock`` associated with ``session_id``, creating it on first use.

    The previous check-then-insert sequence could let two threads that ask for
    the same new session at the same time each install (and receive) a
    different lock, which defeats the purpose of locking. ``dict.setdefault``
    performs the lookup and the insert as a single dictionary operation, so
    every caller gets the same lock object for a given session.
    """
    # The spare Lock built for the default argument is simply discarded when
    # an entry already exists.
    return SESSION_LOCKS.setdefault(session_id, threading.Lock())
def allowed_file(filename):
    """Return ``True`` when ``filename`` is a string ending in ``.pdf`` (case-insensitive).

    Non-string values such as ``None`` are rejected with ``False`` rather than
    raising ``TypeError``. The explicit "contains a dot" test was dropped
    because any name ending in ``.pdf`` necessarily contains one.
    """
    if not isinstance(filename, str):
        return False
    return filename.lower().endswith('.pdf')
def extract_text_headers(reader, page_num):
    """Return the first non-blank line of text on page ``page_num``, stripped.

    Returns ``""`` when the page has no extractable text or when anything goes
    wrong while reading it (the failure is logged as a warning).
    """
    try:
        raw_text = reader.pages[page_num].extract_text() or ""
        for candidate in raw_text.split('\n'):
            stripped = candidate.strip()
            if stripped:
                return stripped
        return ""
    except Exception as exc:
        logging.warning(f"Failed extracting header from page {page_num}: {exc}")
        return ""
def is_blank_page(reader, page_num):
    """Return ``True`` when page ``page_num`` yields no text apart from whitespace.

    If the page cannot be read the failure is logged and ``False`` is returned,
    i.e. an unreadable page is not treated as blank.
    """
    try:
        extracted = reader.pages[page_num].extract_text() or ""
        return not extracted.strip()
    except Exception as exc:
        logging.warning(f"Failed to check blank page at {page_num}: {exc}")
        return False
def is_chapter_header(header):
    """Return ``True`` when ``header`` looks like the start of a chapter or section.

    Recognised openings (case-insensitive, leading whitespace allowed): the
    words chapter / section / part / appendix / introduction, a run of
    roman-numeral letters followed by a dot, or a dotted number followed by
    whitespace.
    """
    heading_patterns = (
        r'^\s*chapter\b', r'^\s*section\b', r'^\s*part\b', r'^\s*appendix\b',
        r'^\s*[ivxlcdm]+\.', r'^\s*\d+(\.\d+)*\s', r'^\s*introduction\b',
    )
    return any(
        re.match(pattern, header, re.IGNORECASE) is not None
        for pattern in heading_patterns
    )
def estimate_writer_size(writer):
    """Return how many bytes ``writer`` emits when serialised.

    The writer is asked to write itself into an in-memory buffer and the
    buffer's final position is reported.
    """
    from io import BytesIO

    sink = BytesIO()
    writer.write(sink)
    return sink.tell()
def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
    """Split the PDF at ``input_path`` into parts written to ``session_dir``.

    Page ranges are planned first, then written out as
    ``split_part_<n>.pdf``. A part grows one page at a time and is closed when
    either adding the next page would push its serialised size above
    ``max_mb``, or its size has reached ``min_split_mb`` and the page just
    added is blank, starts with a chapter-like header, or starts with a
    non-empty header different from the last one recorded. A single page that
    alone exceeds ``max_mb`` becomes its own part.

    Returns a list of dicts with keys ``filename``, ``size`` (in MB) and ``path``.
    """
    bytes_per_mb = 1024 * 1024
    logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
    pdf = PdfReader(input_path)
    page_total = len(pdf.pages)

    # Phase 1: decide the half-open page ranges [first, stop) of every part.
    ranges = []
    previous_header = None
    cursor = 0
    while cursor < page_total:
        first = cursor
        probe = PdfWriter()
        probe.add_page(pdf.pages[first])
        if estimate_writer_size(probe) / bytes_per_mb > max_mb:
            # Oversized single page: emit it alone and move on.
            ranges.append((first, first + 1))
            cursor = first + 1
            continue
        stop = first + 1
        while stop < page_total:
            # Measure the part as it would be with page `stop` included.
            candidate = PdfWriter()
            for page_index in range(first, stop + 1):
                candidate.add_page(pdf.pages[page_index])
            candidate_mb = estimate_writer_size(candidate) / bytes_per_mb
            if candidate_mb > max_mb:
                break
            header = extract_text_headers(pdf, stop)
            blank = is_blank_page(pdf, stop)
            chapter = is_chapter_header(header)
            header_changed = bool(header) and header != previous_header
            if candidate_mb >= min_split_mb and (blank or chapter or header_changed):
                # Page `stop` is kept as the last page of this part.
                stop += 1
                break
            previous_header = header
            stop += 1
        ranges.append((first, stop))
        cursor = stop

    # Phase 2: write each planned range to its own file.
    outputs = []
    input_size = os.path.getsize(input_path) / bytes_per_mb
    for part_no, (start, end) in enumerate(ranges, start=1):
        part = PdfWriter()
        for page_index in range(start, end):
            part.add_page(pdf.pages[page_index])
        out_path = os.path.join(session_dir, f'split_part_{part_no}.pdf')
        with open(out_path, 'wb') as handle:
            part.write(handle)
        size = os.path.getsize(out_path) / bytes_per_mb
        outputs.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
        logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
    total_output_size = sum(entry['size'] for entry in outputs)
    logging.info(f"Original input size: {input_size:.2f} MB, Total split output size: {total_output_size:.2f} MB, {len(outputs)} files created.")
    return outputs
def make_zip_of_splits(split_files, session_dir):
    """Bundle the given split files into ``split_files.zip`` inside ``session_dir``.

    Each entry of ``split_files`` must provide ``path`` (file on disk) and
    ``filename`` (name used inside the archive). Returns the archive's path.
    """
    archive_path = os.path.join(session_dir, "split_files.zip")
    logging.info(f"Creating ZIP at {archive_path} with {len(split_files)} files.")
    with zipfile.ZipFile(archive_path, mode='w') as archive:
        for entry in split_files:
            source_path = entry['path']
            member_name = entry['filename']
            archive.write(source_path, arcname=member_name)
            logging.info(f"Added {member_name} to ZIP.")
    logging.info(f"ZIP created at {archive_path}")
    return archive_path
# Bootstrap theme supplied by dash-bootstrap-components.
external_stylesheets = [dbc.themes.BOOTSTRAP]
# suppress_callback_exceptions=True: several callback inputs (the delete
# buttons, the ZIP button) are only created at runtime by handle_upload and
# are not part of the initial layout.
app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
# Underlying Flask server; the request hook and download routes below attach to it.
server = app.server
app.title = "Intelligent PDF Splitter"
def get_split_results_placeholder():
    """Return the empty ``split-results-inner`` container shown when there are no results."""
    empty_results = html.Div("", id="split-results-inner")
    return empty_results
def get_split_files_ui(split_files, session_id):
    """Render ``split_files`` as a bullet list of download links with Delete buttons.

    Every entry needs ``filename`` and ``size`` (MB). The Delete button of the
    entry at list position ``i`` gets the pattern-matching id
    ``{'type': 'delete-split-btn', 'index': i}``.
    """
    def _list_item(position, info):
        name = info['filename']
        megabytes = info['size']
        link = html.A(
            f"Download {name} ({megabytes:.2f} MB)",
            href=f"/download_split/{session_id}/{name}",
            target="_blank",
            style={'marginRight': '16px'}
        )
        remove_button = dbc.Button(
            "Delete",
            id={'type': 'delete-split-btn', 'index': position},
            color='danger',
            size='sm',
            className='ms-3',
            n_clicks=0
        )
        row = dbc.Row(
            [
                dbc.Col(link, width=9, style={'display': 'flex', 'alignItems': 'center'}),
                dbc.Col(remove_button, width=3, style={'display': 'flex', 'justifyContent': 'end'}),
            ],
            align='center',
        )
        return html.Li(row, style={'marginBottom': '10px'})

    return html.Ul([_list_item(position, info) for position, info in enumerate(split_files)])
# Page layout: two browser-session stores, a download target for the ZIP, and
# a single card holding the upload box, the action buttons and the results area.
app.layout = dbc.Container(
    [
        # Session id; written by ensure_session_id and read by the other callbacks.
        dcc.Store(id='session-id-store', storage_type='session'),
        # Upload/split state (orig_filename, split_files, zip_ready) maintained by handle_upload.
        dcc.Store(id='session-store', storage_type='session'),
        # Output target of trigger_download_zip.
        dcc.Download(id="download-zip-dcc"),
        dbc.Row(
            [
                dbc.Col(
                    dbc.Card(
                        [
                            dbc.CardHeader(html.H2("Intelligent PDF Splitter")),
                            dbc.CardBody(
                                [
                                    html.P("Upload your PDF. The tool will split it into context-preserving sections, each under 5MB."),
                                    # Single-file upload restricted to .pdf in the file picker.
                                    dcc.Upload(
                                        id='upload-pdf',
                                        children=html.Div([
                                            'Drag and Drop or ',
                                            html.A('Select PDF File')
                                        ]),
                                        style={
                                            'width': '100%', 'height': '80px', 'lineHeight': '80px',
                                            'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
                                            'textAlign': 'center', 'margin': '10px 0'
                                        },
                                        multiple=False,
                                        accept='.pdf'
                                    ),
                                    # Filled by handle_upload with the uploaded file's name and a Delete button.
                                    html.Div(id='file-info', className='mb-4'),
                                    dbc.Row([
                                        dbc.Col(
                                            # Starts disabled; handle_upload enables it once a file is stored.
                                            dbc.Button(
                                                "Split PDF", id='split-btn',
                                                color='primary',
                                                style={'width': '180px', 'fontWeight': 'bold'},
                                                n_clicks=0,
                                                disabled=True
                                            ),
                                            width="auto"
                                        ),
                                        dbc.Col(
                                            dbc.Button(
                                                "Clear Session", id='clear-session',
                                                color='secondary',
                                                n_clicks=0,
                                                className='ms-3'
                                            ),
                                            width="auto"
                                        ),
                                    ], className='mb-3 mt-2', align='center', justify='start'),
                                    # Results area wrapped in a loading indicator; starts with the empty placeholder.
                                    dcc.Loading(
                                        id="loading", type="default",
                                        children=[html.Div(id='split-results', children=get_split_results_placeholder())]
                                    )
                                ]
                            )
                        ],
                        className="mt-4"
                    ),
                    width=12
                ),
            ]
        )
    ],
    fluid=True,
    className="p-4"
)
@app.server.before_request
def persist_session_cookie():
    """Make sure every request has a session id available on ``flask.g``.

    When the browser sent no ``session-id`` cookie a new UUID is generated and
    remembered on the request object as ``session_id_set`` so that
    ``_attach_session_cookie`` can send it back with the response.

    Nothing is returned on purpose: returning a response from a
    ``before_request`` hook would replace the real view's response.
    """
    session_id = flask.request.cookies.get('session-id')
    if not session_id:
        session_id = str(uuid.uuid4())
        # Marker read by ensure_session_id and by the after_request hook below.
        flask.request.session_id_set = session_id
    flask.g.session_id = session_id


@app.server.after_request
def _attach_session_cookie(response):
    """Set the ``session-id`` cookie on the outgoing response for new sessions.

    Previously the cookie was set on a throw-away response object created in
    the ``before_request`` hook, so it never reached the browser.
    """
    new_session_id = getattr(flask.request, 'session_id_set', None)
    if new_session_id:
        response.set_cookie('session-id', new_session_id, max_age=60*60*24*3)
    return response
@app.callback(
    Output('session-id-store', 'data'),
    Input('session-id-store', 'data'),
    prevent_initial_call=False
)
def ensure_session_id(session_id):
    """Guarantee that ``session-id-store`` holds a session id.

    An id already in the store is kept. Otherwise the id generated for this
    request by the ``before_request`` hook is used when present, falling back
    to the cookie value or a fresh UUID. The chosen id is also placed on
    ``flask.g``. Any unexpected error is logged and answered with a new UUID.
    """
    _absent = object()
    try:
        if session_id:
            resolved = session_id
        else:
            pending = getattr(flask.request, 'session_id_set', _absent)
            resolved = get_or_create_session_id() if pending is _absent else pending
        flask.g.session_id = resolved
        return resolved
    except Exception as exc:
        logging.error(f"Error ensuring session id: {exc}")
        return str(uuid.uuid4())
def _safe_basename(name):
    """Return the last path component of ``name``, or ``''`` when it is unusable.

    Both ``/`` and ``\\`` are treated as separators so a name can never point
    outside the directory it is later joined to.
    """
    if not isinstance(name, str):
        return ''
    base = os.path.basename(name.replace('\\', '/'))
    return '' if base in ('', '.', '..') else base


def _trusted_split_entries(split_files, session_dir):
    """Rebuild split-file records so that every ``path`` lies inside ``session_dir``.

    The records travel through the browser-side store, so the ``path`` they
    carry is ignored and recomputed from the sanitised ``filename``.
    """
    entries = []
    for item in split_files or []:
        if not isinstance(item, dict):
            continue
        base = _safe_basename(item.get('filename'))
        if not base:
            continue
        entries.append({
            'filename': base,
            'size': item.get('size') or 0,
            'path': os.path.join(session_dir, base),
        })
    return entries


def _build_upload_row(label):
    """Return the "Uploaded: ..." row with its Delete button."""
    return dbc.Row([
        dbc.Col(html.Div(label), width=9, style={'display': 'flex', 'alignItems': 'center'}),
        dbc.Col(
            dbc.Button("Delete", id={'type': 'delete-upload-btn', 'index': 0}, color='danger', n_clicks=0, className='ms-5'),
            width=3, style={'display': 'flex', 'justifyContent': 'end'}
        )
    ], className='mb-3', align='center', style={'marginTop': "15px", 'marginBottom': '25px'})


def _build_split_results(split_files, session_id, after_delete=False):
    """Return the "Split Files" panel (file list plus the ZIP download button).

    With ``after_delete=True`` an empty list is rendered as "No split files
    remain." and the ZIP button is hidden when nothing is left.
    """
    zip_button = dbc.Button(
        "Download All (ZIP)", color="primary", size="lg", className='mb-3 mt-4',
        id="download-zip-btn"
    )
    if after_delete:
        listing = get_split_files_ui(split_files, session_id) if split_files else html.Div("No split files remain.")
        zip_holder = html.Div(
            zip_button,
            style={'marginTop': '30px', 'display': 'block' if split_files else 'none'}
        )
    else:
        listing = get_split_files_ui(split_files, session_id)
        zip_holder = html.Div(zip_button, style={'marginTop': '30px'})
    return html.Div([
        html.H5("Split Files:"),
        listing,
        zip_holder,
    ], id="split-results-inner")


@app.callback(
    Output('file-info', 'children'),
    Output('split-btn', 'disabled'),
    Output('split-results', 'children'),
    Output('session-store', 'data'),
    Input('upload-pdf', 'contents'),
    State('upload-pdf', 'filename'),
    Input('clear-session', 'n_clicks'),
    Input({'type': 'delete-upload-btn', 'index': ALL}, 'n_clicks'),
    Input('split-btn', 'n_clicks'),
    Input({'type': 'delete-split-btn', 'index': ALL}, 'n_clicks'),
    State('session-store', 'data'),
    State('session-id-store', 'data'),
    prevent_initial_call=True
)
def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, delete_split_n_list, session_data, session_id):
    """Central UI callback: upload, split, delete and clear actions.

    Returns a 4-tuple ``(file-info children, split button disabled flag,
    split-results children, session-store data)``.

    Actions are checked in this order: clear session, delete uploaded file,
    delete one split file, new upload, split. When none applies the view is
    rebuilt from ``session_data``.

    Security notes: ``filename`` comes from the client and ``session_data``
    round-trips through the browser-side ``dcc.Store``, so neither is trusted
    for filesystem access. File names are reduced to their base name and
    paths are always recomputed inside the session directory.
    """
    trigger = ctx.triggered_id
    logging.info(f"handle_upload: Triggered by {trigger}, session_id={session_id}")
    if not session_id:
        session_id = str(uuid.uuid4())
        logging.info(f"handle_upload: Generated new session_id {session_id}")
    flask.g.session_id = session_id
    session_dir = get_session_dir(session_id)
    lock = get_session_lock(session_id)
    if not isinstance(session_data, dict):
        session_data = {}

    # --- Clear session -----------------------------------------------------
    if trigger == 'clear-session':
        logging.info(f"handle_upload: Clear session button pressed for {session_id}")
        clean_session(session_id)
        resp_data = {}
        logging.info(f"Session cleared for {session_id}")
        return "", True, get_split_results_placeholder(), resp_data

    # --- Delete the uploaded file ------------------------------------------
    # Only a positive click count counts as a press: a delete button that has
    # just been rendered (n_clicks == 0) must not wipe the fresh upload, even
    # if the framework reports it as the trigger.
    upload_clicks = delete_upload_n_list or []
    if any(n is not None and n > 0 for n in upload_clicks):
        orig_filename = _safe_basename(session_data.get('orig_filename'))
        if orig_filename:
            pdf_path = os.path.join(session_dir, orig_filename)
            if os.path.isfile(pdf_path):
                os.remove(pdf_path)
                logging.info(f"Deleted uploaded file {pdf_path} for session {session_id}")
        if os.path.isdir(session_dir):
            for entry in os.listdir(session_dir):
                entry_path = os.path.join(session_dir, entry)
                # os.remove cannot delete directories; skip anything unexpected.
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
            logging.info(f"Session files deleted for {session_id}")
        return "", True, get_split_results_placeholder(), {}

    # --- Delete one split file ----------------------------------------------
    split_clicks = delete_split_n_list or []
    delete_split_idx = None
    if any(n is not None and n > 0 for n in split_clicks):
        if isinstance(trigger, dict) and trigger.get('type') == 'delete-split-btn':
            delete_split_idx = trigger.get('index')
        else:
            # Fallback kept from the original: first button with a click.
            delete_split_idx = next(
                i for i, n in enumerate(split_clicks) if n is not None and n > 0
            )
    if delete_split_idx is not None and session_data.get('split_files'):
        split_files = _trusted_split_entries(session_data['split_files'], session_dir)
        if isinstance(delete_split_idx, int) and 0 <= delete_split_idx < len(split_files):
            removed = split_files.pop(delete_split_idx)
            file_path = removed['path']
            if os.path.isfile(file_path):
                os.remove(file_path)
                logging.info(f"Deleted split file: {file_path} for session {session_id}")
            session_data['split_files'] = split_files
            zip_path = os.path.join(session_dir, "split_files.zip")
            if split_files:
                # Keep the archive in sync with the remaining files.
                make_zip_of_splits(split_files, session_dir)
                session_data['zip_ready'] = True
                logging.info(f"Regenerated ZIP file after split delete for session {session_id}")
            else:
                if os.path.exists(zip_path):
                    os.remove(zip_path)
                    logging.info(f"Deleted ZIP file as no splits remain for session {session_id}")
                session_data['zip_ready'] = False
            orig_filename = session_data.get('orig_filename', '')
            file_info = _build_upload_row(f"Uploaded: {orig_filename}")
            results = _build_split_results(split_files, session_id, after_delete=True)
            return file_info, False, results, session_data
        else:
            logging.warning(f"Split file delete index {delete_split_idx} invalid for session {session_id}")

    # --- New upload -----------------------------------------------------------
    if trigger == 'upload-pdf':
        logging.info(f"handle_upload: Upload triggered for filename={filename}, session_id={session_id}")
        if not contents:
            logging.warning("No contents received in upload.")
            return "", True, get_split_results_placeholder(), {}
        # Strip any directory part so the file is always written inside session_dir.
        safe_name = _safe_basename(filename)
        if not safe_name or not allowed_file(safe_name):
            logging.warning(f"Disallowed file attempted upload: {filename}")
            return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), True, get_split_results_placeholder(), {}
        try:
            # contents is a data URL: "<header>,<base64 payload>".
            header, b64data = contents.split(',', 1)
            pdf_bytes = base64.b64decode(b64data)
            pdf_path = os.path.join(session_dir, safe_name)
            with open(pdf_path, 'wb') as f:
                f.write(pdf_bytes)
            logging.info(f"PDF uploaded and saved to {pdf_path} for session {session_id}")
            session_data = {
                'orig_filename': safe_name,
                'split_files': None,
                'zip_ready': False,
            }
            file_info = _build_upload_row(f"Uploaded: {safe_name} ({len(pdf_bytes)/1024/1024:.2f} MB)")
            logging.info(f"handle_upload: File info UI updated, split button enabled.")
            return file_info, False, get_split_results_placeholder(), session_data
        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), True, get_split_results_placeholder(), {}

    # --- Split ------------------------------------------------------------------
    if trigger == 'split-btn':
        orig_filename = _safe_basename(session_data.get('orig_filename'))
        logging.info(f"handle_upload: Split button clicked for {session_id}, orig_filename={orig_filename}")
        if not orig_filename:
            logging.error(f"Split button clicked but no file to split for session {session_id}")
            return html.Div("No file to split.", style={'color': 'red'}), True, get_split_results_placeholder(), session_data
        pdf_path = os.path.join(session_dir, orig_filename)
        if not os.path.isfile(pdf_path):
            logging.error(f"Split button clicked but uploaded file not found for session {session_id}")
            return html.Div("Uploaded file not found. Please upload again.", style={'color': 'red'}), True, get_split_results_placeholder(), {}
        try:
            logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
            # Serialise splitting per session so two requests do not write the
            # same output files at once.
            with lock:
                split_files = intelligent_pdf_split(pdf_path, session_dir)
                for fi in split_files:
                    logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                zip_path = make_zip_of_splits(split_files, session_dir)
            logging.info(f"Split/ZIP finished for {session_id}, zip_path={zip_path}")
            session_data['split_files'] = split_files
            session_data['zip_ready'] = True
            file_info = _build_upload_row(f"Uploaded: {orig_filename}")
            results = _build_split_results(split_files, session_id)
            logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}, zip ready.")
            return file_info, False, results, session_data
        except Exception as e:
            logging.error(f"Error splitting PDF for session {session_id}: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), False, get_split_results_placeholder(), session_data

    # --- No explicit action: rebuild the view from stored state ----------------
    if session_data.get('split_files'):
        split_files = session_data['split_files']
        orig_filename = session_data.get('orig_filename', '')
        file_info = _build_upload_row(f"Uploaded: {orig_filename}")
        results = _build_split_results(split_files, session_id)
        logging.info(f"handle_upload: Restoring split results for session {session_id}, {len(split_files)} files.")
        return file_info, False, results, session_data
    if session_data.get('orig_filename') and not session_data.get('split_files'):
        file_info = _build_upload_row(f"Uploaded: {session_data['orig_filename']}")
        logging.info(f"handle_upload: Restoring view after upload, split button enabled.")
        return file_info, False, get_split_results_placeholder(), session_data
    logging.info(f"handle_upload: No action taken, returning current session_data for session {session_id}")
    return "", True, get_split_results_placeholder(), session_data
@app.callback(
    Output("download-zip-dcc", "data"),
    Input("download-zip-btn", "n_clicks"),
    State('session-id-store', 'data'),
    prevent_initial_call=True
)
def trigger_download_zip(n_clicks, session_id):
    """Send the session's ``split_files.zip`` to the browser when the ZIP button is clicked.

    Returns ``dash.no_update`` when there was no click, no session id, or the
    archive does not exist (the latter is logged as an error).
    """
    if not (n_clicks and session_id):
        return dash.no_update
    archive_path = os.path.join(get_session_dir(session_id), "split_files.zip")
    if not os.path.exists(archive_path):
        logging.error(f"trigger_download_zip: Zip file not found for session {session_id}")
        return dash.no_update
    logging.info(f"trigger_download_zip: Sending zip {archive_path} for session {session_id}")
    return dcc.send_file(archive_path)
@app.server.route('/download_zip/<session_id>/<filename>')
def download_zip_file(session_id, filename):
    """Serve ``filename`` from the session directory as a ZIP attachment.

    Responds with 404 when the name contains a path component, is ``.`` or
    ``..``, or does not refer to a regular file. The earlier
    ``os.path.exists`` check also accepted directories, which made
    ``send_file`` fail instead of producing a clean 404.
    """
    session_dir = get_session_dir(session_id)
    file_path = os.path.join(session_dir, filename)
    # The name must be a bare file name so the joined path stays in session_dir.
    is_plain_name = filename not in ('.', '..') and os.path.basename(filename) == filename
    if is_plain_name and os.path.isfile(file_path):
        logging.info(f"Serving zip file {file_path} for session {session_id}")
        return send_file(file_path, mimetype='application/zip', as_attachment=True, download_name=filename)
    else:
        logging.error(f"ZIP file not found for download: {file_path}")
        return "File not found", 404
@app.server.route('/download_split/<session_id>/<filename>')
def download_split_file(session_id, filename):
    """Serve one split PDF from the session directory as an attachment.

    Responds with 404 when the name contains a path component, is ``.`` or
    ``..``, or does not refer to a regular file. The earlier
    ``os.path.exists`` check also accepted directories, which made
    ``send_file`` fail instead of producing a clean 404.
    """
    session_dir = get_session_dir(session_id)
    file_path = os.path.join(session_dir, filename)
    # The name must be a bare file name so the joined path stays in session_dir.
    is_plain_name = filename not in ('.', '..') and os.path.basename(filename) == filename
    if is_plain_name and os.path.isfile(file_path):
        logging.info(f"Serving split file {file_path} for session {session_id}")
        return send_file(file_path, mimetype='application/pdf', as_attachment=True, download_name=filename)
    else:
        logging.error(f"Split file not found for download: {file_path}")
        return "File not found", 404
# Script entry point: start the server only when the file is executed directly.
if __name__ == '__main__':
    print("Starting the Dash application...")
    # Listens on all interfaces (0.0.0.0), port 7860, with debug mode off and
    # threaded request handling enabled.
    app.run(debug=False, host='0.0.0.0', port=7860, threaded=True)
    # Presumably reached only after the server stops - app.run is expected to block.
    print("Dash application has finished running.")