"""Dash app that splits an uploaded PDF into smaller parts and serves them.

Uploaded and generated files live in a per-session directory under the
system temp dir. All values that arrive from the browser (session id, file
names, the contents of the ``dcc.Store`` components) are treated as untrusted
and are validated before they are used to build filesystem paths.
"""

import base64
import io
import logging
import os
import re
import shutil
import tempfile
import threading
import uuid
import zipfile

import dash
import dash_bootstrap_components as dbc
import flask
from dash import html, dcc, Input, Output, State, ctx, ALL
from flask import send_file
from PyPDF2 import PdfReader, PdfWriter

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

SESSION_DATA = {}
SESSION_LOCKS = {}

_BYTES_PER_MB = 1024 * 1024
_SESSION_COOKIE = 'session-id'
_SESSION_COOKIE_MAX_AGE = 60 * 60 * 24 * 3  # three days, in seconds
_ZIP_NAME = "split_files.zip"

# Session ids are generated with uuid4(); anything outside this alphabet is
# rejected so that an id can never act as a path component such as "../..".
_SESSION_ID_RE = re.compile(r'^[A-Za-z0-9_-]{1,128}$')

# Compiled once; each pattern is tried against the first text line of a page.
_CHAPTER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'^\s*chapter\b',
        r'^\s*section\b',
        r'^\s*part\b',
        r'^\s*appendix\b',
        r'^\s*[ivxlcdm]+\.',
        r'^\s*\d+(\.\d+)*\s',
        r'^\s*introduction\b',
    )
)


def _is_valid_session_id(session_id):
    """Return True if *session_id* is a non-empty string of safe characters."""
    return isinstance(session_id, str) and bool(_SESSION_ID_RE.match(session_id))


def _safe_basename(name):
    """Return the final path component of *name*, or '' if it is unusable.

    Both '/' and '\\' are treated as separators so that a name produced on a
    different platform cannot smuggle a directory part through.
    """
    if not isinstance(name, str):
        return ''
    base = os.path.basename(name.replace('\\', '/'))
    return '' if base in ('', '.', '..') else base


def get_session_id_from_cookie():
    """Return the raw 'session-id' cookie of the current request, or None."""
    return flask.request.cookies.get(_SESSION_COOKIE)


def get_or_create_session_id():
    """Return the request's session cookie if valid, otherwise a new uuid4 string."""
    sid = flask.request.cookies.get(_SESSION_COOKIE)
    if not _is_valid_session_id(sid):
        sid = str(uuid.uuid4())
    return sid


def get_session_dir(session_id, create=True):
    """Return the temp directory that belongs to *session_id*.

    Args:
        session_id: Session identifier; must match ``_SESSION_ID_RE``.
        create: When True (the default) the directory is created if missing.

    Raises:
        ValueError: If *session_id* is not a valid identifier.
    """
    if not _is_valid_session_id(session_id):
        raise ValueError(f"Invalid session id: {session_id!r}")
    path = os.path.join(tempfile.gettempdir(), f'dash_pdfsplit_{session_id}')
    if create:
        os.makedirs(path, exist_ok=True)
    return path


def clean_session(session_id):
    """Delete the session directory and forget the session's in-memory state.

    Errors are logged rather than raised.
    """
    try:
        session_dir = get_session_dir(session_id, create=False)
        if os.path.isdir(session_dir):
            shutil.rmtree(session_dir)
        SESSION_DATA.pop(session_id, None)
        SESSION_LOCKS.pop(session_id, None)
        logging.info(f"Cleaned session for {session_id}")
    except Exception as e:
        logging.error(f"Error cleaning session {session_id}: {e}")


def get_session_lock(session_id):
    """Return the lock for *session_id*, creating it on first use.

    ``setdefault`` is used so that two threads asking for the same session
    at the same time end up with the same lock object.
    """
    return SESSION_LOCKS.setdefault(session_id, threading.Lock())


def allowed_file(filename):
    """Return True if *filename* is a string ending in '.pdf' (case-insensitive)."""
    return isinstance(filename, str) and filename.lower().endswith('.pdf')


def extract_text_headers(reader, page_num):
    """Return the first non-empty text line of a page, or '' if none/failed."""
    try:
        page = reader.pages[page_num]
        text = page.extract_text() or ""
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        return lines[0] if lines else ""
    except Exception as e:
        logging.warning(f"Failed extracting header from page {page_num}: {e}")
        return ""


def is_blank_page(reader, page_num):
    """Return True if the page yields no extractable text.

    Returns False when text extraction itself fails.
    """
    try:
        page = reader.pages[page_num]
        text = (page.extract_text() or "").strip()
        return not text
    except Exception as e:
        logging.warning(f"Failed to check blank page at {page_num}: {e}")
        return False


def is_chapter_header(header):
    """Return True if *header* starts like a chapter/section heading."""
    return any(pattern.match(header) for pattern in _CHAPTER_PATTERNS)


def estimate_writer_size(writer):
    """Return the number of bytes *writer* produces when serialized in memory."""
    buffer = io.BytesIO()
    writer.write(buffer)
    return buffer.tell()


def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
    """Split a PDF into consecutive page ranges and write each as its own file.

    Pages are accumulated into a part until adding the next page would push
    the serialized size above *max_mb*. Once a part has reached
    *min_split_mb*, it is also closed at a page that is blank, looks like a
    chapter heading, or whose first text line differs from the previous
    page's. A single page that alone exceeds *max_mb* becomes its own part.

    Args:
        input_path: Path of the PDF to split.
        session_dir: Directory that receives ``split_part_<n>.pdf`` files.
        max_mb: Size ceiling, in MiB, for a multi-page part.
        min_split_mb: Size, in MiB, from which content-based splits are allowed.

    Returns:
        A list of dicts with keys 'filename', 'size' (MiB on disk) and 'path'.
    """
    logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
    reader = PdfReader(input_path)
    n_pages = len(reader.pages)
    splits = []
    last_header = None
    i = 0
    while i < n_pages:
        part_start = i
        writer = PdfWriter()
        writer.add_page(reader.pages[i])
        size = estimate_writer_size(writer) / _BYTES_PER_MB
        if size > max_mb:
            # Oversized single page: nothing can be done but emit it alone.
            splits.append((i, i + 1))
            i += 1
            continue
        j = i + 1
        while j < n_pages:
            # The candidate part is re-serialized from scratch for every
            # added page to measure its real size.
            tmp_writer = PdfWriter()
            for k in range(part_start, j + 1):
                tmp_writer.add_page(reader.pages[k])
            size = estimate_writer_size(tmp_writer) / _BYTES_PER_MB
            if size > max_mb:
                break
            header = extract_text_headers(reader, j)
            blank = is_blank_page(reader, j)
            chapter = is_chapter_header(header)
            if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
                # Page j is kept as the last page of the current part.
                j += 1
                break
            last_header = header
            j += 1
        splits.append((part_start, j))
        i = j

    split_files = []
    input_size = os.path.getsize(input_path) / _BYTES_PER_MB
    for idx, (start, end) in enumerate(splits):
        writer = PdfWriter()
        for p in range(start, end):
            writer.add_page(reader.pages[p])
        out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
        with open(out_path, 'wb') as f:
            writer.write(f)
        size = os.path.getsize(out_path) / _BYTES_PER_MB
        split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
        logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
    total_output_size = sum(f['size'] for f in split_files)
    logging.info(f"Original input size: {input_size:.2f} MB, Total split output size: {total_output_size:.2f} MB, {len(split_files)} files created.")
    return split_files


def make_zip_of_splits(split_files, session_dir):
    """Write all *split_files* into ``split_files.zip`` in *session_dir*.

    Returns:
        The path of the created ZIP archive.
    """
    zip_path = os.path.join(session_dir, _ZIP_NAME)
    logging.info(f"Creating ZIP at {zip_path} with {len(split_files)} files.")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in split_files:
            zipf.write(file['path'], arcname=file['filename'])
            logging.info(f"Added {file['filename']} to ZIP.")
    logging.info(f"ZIP created at {zip_path}")
    return zip_path


def _trusted_split_files(split_files, session_dir):
    """Rebuild split-file entries that came back from the browser store.

    The stored 'path' is ignored: it is re-derived from *session_dir* and the
    basename of 'filename', so a tampered store cannot point file operations
    at anything outside the session directory. Malformed entries are dropped.
    """
    trusted = []
    for entry in split_files or []:
        if not isinstance(entry, dict):
            continue
        name = _safe_basename(entry.get('filename'))
        if not name:
            continue
        try:
            size = float(entry.get('size', 0))
        except (TypeError, ValueError):
            size = 0.0
        trusted.append({'filename': name, 'size': size, 'path': os.path.join(session_dir, name)})
    return trusted


def _resolve_session_file(session_id, filename):
    """Return the path of an existing file in the session dir, or None.

    None is returned for an invalid session id, for a *filename* that is not
    a plain basename, and for a file that does not exist.
    """
    safe_name = _safe_basename(filename)
    if not safe_name or safe_name != filename or not _is_valid_session_id(session_id):
        return None
    path = os.path.join(get_session_dir(session_id, create=False), safe_name)
    return path if os.path.isfile(path) else None


external_stylesheets = [dbc.themes.BOOTSTRAP]
app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
server = app.server
app.title = "Intelligent PDF Splitter"


def get_split_results_placeholder():
    """Return the empty results container shown when there is nothing to list."""
    return html.Div("", id="split-results-inner")


def get_split_files_ui(split_files, session_id):
    """Return a list with a download link and a delete button per split file."""
    items = []
    for i, fi in enumerate(split_files):
        fname = fi['filename']
        size = fi['size']
        download_link = html.A(
            f"Download {fname} ({size:.2f} MB)",
            href=f"/download_split/{session_id}/{fname}",
            target="_blank",
            style={'marginRight': '16px'}
        )
        delete_btn = dbc.Button(
            "Delete",
            id={'type': 'delete-split-btn', 'index': i},
            color='danger',
            size='sm',
            className='ms-3',
            n_clicks=0
        )
        items.append(
            html.Li(
                dbc.Row([
                    dbc.Col(download_link, width=9, style={'display': 'flex', 'alignItems': 'center'}),
                    dbc.Col(delete_btn, width=3, style={'display': 'flex', 'justifyContent': 'end'})
                ], align='center'),
                style={'marginBottom': '10px'}
            )
        )
    return html.Ul(items)


def _build_file_info(label):
    """Return the 'uploaded file' row: *label* plus the delete-upload button."""
    return dbc.Row([
        dbc.Col(html.Div(label), width=9, style={'display': 'flex', 'alignItems': 'center'}),
        dbc.Col(
            dbc.Button("Delete", id={'type': 'delete-upload-btn', 'index': 0}, color='danger', n_clicks=0, className='ms-5'),
            width=3,
            style={'display': 'flex', 'justifyContent': 'end'}
        )
    ], className='mb-3', align='center', style={'marginTop': "15px", 'marginBottom': '25px'})


def _build_results(split_files, session_id):
    """Return the results panel; the ZIP button is hidden when no files remain."""
    if split_files:
        listing = get_split_files_ui(split_files, session_id)
    else:
        listing = html.Div("No split files remain.")
    zip_button = dbc.Button(
        "Download All (ZIP)",
        color="primary",
        size="lg",
        className='mb-3 mt-4',
        id="download-zip-btn"
    )
    return html.Div([
        html.H5("Split Files:"),
        listing,
        html.Div(zip_button, style={'marginTop': '30px', 'display': 'block' if split_files else 'none'}),
    ], id="split-results-inner")


app.layout = dbc.Container(
    [
        dcc.Store(id='session-id-store', storage_type='session'),
        dcc.Store(id='session-store', storage_type='session'),
        dcc.Download(id="download-zip-dcc"),
        dbc.Row(
            [
                dbc.Col(
                    dbc.Card(
                        [
                            dbc.CardHeader(html.H2("Intelligent PDF Splitter")),
                            dbc.CardBody(
                                [
                                    html.P("Upload your PDF. The tool will split it into context-preserving sections, each under 5MB."),
                                    dcc.Upload(
                                        id='upload-pdf',
                                        children=html.Div([
                                            'Drag and Drop or ',
                                            html.A('Select PDF File')
                                        ]),
                                        style={
                                            'width': '100%',
                                            'height': '80px',
                                            'lineHeight': '80px',
                                            'borderWidth': '1px',
                                            'borderStyle': 'dashed',
                                            'borderRadius': '5px',
                                            'textAlign': 'center',
                                            'margin': '10px 0'
                                        },
                                        multiple=False,
                                        accept='.pdf'
                                    ),
                                    html.Div(id='file-info', className='mb-4'),
                                    dbc.Row([
                                        dbc.Col(
                                            dbc.Button(
                                                "Split PDF",
                                                id='split-btn',
                                                color='primary',
                                                style={'width': '180px', 'fontWeight': 'bold'},
                                                n_clicks=0,
                                                disabled=True
                                            ),
                                            width="auto"
                                        ),
                                        dbc.Col(
                                            dbc.Button(
                                                "Clear Session",
                                                id='clear-session',
                                                color='secondary',
                                                n_clicks=0,
                                                className='ms-3'
                                            ),
                                            width="auto"
                                        ),
                                    ], className='mb-3 mt-2', align='center', justify='start'),
                                    dcc.Loading(
                                        id="loading",
                                        type="default",
                                        children=[html.Div(id='split-results', children=get_split_results_placeholder())]
                                    )
                                ]
                            )
                        ],
                        className="mt-4"
                    ),
                    width=12
                ),
            ]
        )
    ],
    fluid=True,
    className="p-4"
)


@app.server.before_request
def persist_session_cookie():
    """Make sure every request has a session id available on ``flask.g``.

    A valid 'session-id' cookie is reused. Otherwise a new id is generated
    and remembered so the after-request hook can send it to the browser.
    """
    session_id = flask.request.cookies.get(_SESSION_COOKIE)
    if not _is_valid_session_id(session_id):
        session_id = str(uuid.uuid4())
        flask.g.new_session_id = session_id
    flask.g.session_id = session_id


@app.server.after_request
def _attach_session_cookie(response):
    """Set the 'session-id' cookie on the response when a new id was generated."""
    new_session_id = getattr(flask.g, 'new_session_id', None)
    if new_session_id:
        response.set_cookie(
            _SESSION_COOKIE,
            new_session_id,
            max_age=_SESSION_COOKIE_MAX_AGE,
            httponly=True,
            samesite='Lax',
        )
    return response


@app.callback(
    Output('session-id-store', 'data'),
    Input('session-id-store', 'data'),
    prevent_initial_call=False
)
def ensure_session_id(session_id):
    """Return the session id to keep in the browser store.

    A valid stored id is returned unchanged. Otherwise the id attached to the
    current request (cookie or freshly generated) is used.
    """
    try:
        if _is_valid_session_id(session_id):
            return session_id
        sid = getattr(flask.g, 'session_id', None) or get_or_create_session_id()
        flask.g.session_id = sid
        return sid
    except Exception as e:
        logging.error(f"Error ensuring session id: {e}")
        return str(uuid.uuid4())


def _upload_delete_requested(trigger, n_clicks_list):
    """Return True if the delete-upload button triggered or reports a click."""
    if isinstance(trigger, dict) and trigger.get('type') == 'delete-upload-btn':
        return True
    return any(n is not None and n > 0 for n in n_clicks_list or [])


def _split_delete_index(trigger, n_clicks_list):
    """Return the index of the split file to delete, or None if none requested."""
    if isinstance(trigger, dict) and trigger.get('type') == 'delete-split-btn':
        return trigger.get('index')
    for position, n in enumerate(n_clicks_list or []):
        if n is not None and n > 0:
            return position
    return None


def _remove_dir_contents(directory):
    """Delete every file and sub-directory inside *directory*."""
    if not os.path.isdir(directory):
        return
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


@app.callback(
    Output('file-info', 'children'),
    Output('split-btn', 'disabled'),
    Output('split-results', 'children'),
    Output('session-store', 'data'),
    Input('upload-pdf', 'contents'),
    State('upload-pdf', 'filename'),
    Input('clear-session', 'n_clicks'),
    Input({'type': 'delete-upload-btn', 'index': ALL}, 'n_clicks'),
    Input('split-btn', 'n_clicks'),
    Input({'type': 'delete-split-btn', 'index': ALL}, 'n_clicks'),
    State('session-store', 'data'),
    State('session-id-store', 'data'),
    prevent_initial_call=True
)
def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, delete_split_n_list, session_data, session_id):
    """Handle upload, split, delete and clear actions for the current session.

    Returns a 4-tuple matching the callback outputs: the file-info row, the
    disabled state of the split button, the results panel, and the session
    data to store in the browser.
    """
    trigger = ctx.triggered_id
    logging.info(f"handle_upload: Triggered by {trigger}, session_id={session_id}")
    if not _is_valid_session_id(session_id):
        session_id = str(uuid.uuid4())
        logging.info(f"handle_upload: Generated new session_id {session_id}")
    flask.g.session_id = session_id
    if not isinstance(session_data, dict):
        session_data = {}

    if trigger == 'clear-session':
        logging.info(f"handle_upload: Clear session button pressed for {session_id}")
        clean_session(session_id)
        logging.info(f"Session cleared for {session_id}")
        return "", True, get_split_results_placeholder(), {}

    session_dir = get_session_dir(session_id)
    # The stored name comes back from the browser, so reduce it to a basename
    # before it is ever joined onto the session directory.
    stored_filename = _safe_basename(session_data.get('orig_filename'))

    if _upload_delete_requested(trigger, delete_upload_n_list):
        if stored_filename:
            pdf_path = os.path.join(session_dir, stored_filename)
            if os.path.isfile(pdf_path):
                os.remove(pdf_path)
                logging.info(f"Deleted uploaded file {pdf_path} for session {session_id}")
        _remove_dir_contents(session_dir)
        logging.info(f"Session files deleted for {session_id}")
        return "", True, get_split_results_placeholder(), {}

    delete_split_idx = _split_delete_index(trigger, delete_split_n_list)
    if delete_split_idx is not None and session_data.get('split_files'):
        split_files = _trusted_split_files(session_data['split_files'], session_dir)
        if isinstance(delete_split_idx, int) and 0 <= delete_split_idx < len(split_files):
            file_path = split_files[delete_split_idx]['path']
            if os.path.isfile(file_path):
                os.remove(file_path)
                logging.info(f"Deleted split file: {file_path} for session {session_id}")
            split_files = [f for i, f in enumerate(split_files) if i != delete_split_idx]
            session_data['split_files'] = split_files
            zip_path = os.path.join(session_dir, _ZIP_NAME)
            if split_files:
                make_zip_of_splits(split_files, session_dir)
                session_data['zip_ready'] = True
                logging.info(f"Regenerated ZIP file after split delete for session {session_id}")
            else:
                if os.path.exists(zip_path):
                    os.remove(zip_path)
                    logging.info(f"Deleted ZIP file as no splits remain for session {session_id}")
                session_data['zip_ready'] = False
            file_info = _build_file_info(f"Uploaded: {stored_filename}")
            return file_info, False, _build_results(split_files, session_id), session_data
        logging.warning(f"Split file delete index {delete_split_idx} invalid for session {session_id}")

    if trigger == 'upload-pdf':
        logging.info(f"handle_upload: Upload triggered for filename={filename}, session_id={session_id}")
        if not contents:
            logging.warning("No contents received in upload.")
            return "", True, get_split_results_placeholder(), {}
        safe_name = _safe_basename(filename)
        if not safe_name or not allowed_file(safe_name):
            logging.warning(f"Disallowed file attempted upload: {filename}")
            return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), True, get_split_results_placeholder(), {}
        try:
            # contents is a data URL: "<header>,<base64 payload>".
            _, b64data = contents.split(',', 1)
            pdf_bytes = base64.b64decode(b64data)
            pdf_path = os.path.join(session_dir, safe_name)
            with open(pdf_path, 'wb') as f:
                f.write(pdf_bytes)
            logging.info(f"PDF uploaded and saved to {pdf_path} for session {session_id}")
            session_data = {
                'orig_filename': safe_name,
                'split_files': None,
                'zip_ready': False,
            }
            file_info = _build_file_info(f"Uploaded: {safe_name} ({len(pdf_bytes)/1024/1024:.2f} MB)")
            logging.info("handle_upload: File info UI updated, split button enabled.")
            return file_info, False, get_split_results_placeholder(), session_data
        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), True, get_split_results_placeholder(), {}

    if trigger == 'split-btn':
        logging.info(f"handle_upload: Split button clicked for {session_id}, orig_filename={stored_filename}")
        if not stored_filename:
            logging.error(f"Split button clicked but no file to split for session {session_id}")
            return html.Div("No file to split.", style={'color': 'red'}), True, get_split_results_placeholder(), session_data
        pdf_path = os.path.join(session_dir, stored_filename)
        if not os.path.isfile(pdf_path):
            logging.error(f"Split button clicked but uploaded file not found for session {session_id}")
            return html.Div("Uploaded file not found. Please upload again.", style={'color': 'red'}), True, get_split_results_placeholder(), {}
        try:
            logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
            # Serialize split + zip per session so two clicks cannot write
            # the same output files at the same time.
            with get_session_lock(session_id):
                split_files = intelligent_pdf_split(pdf_path, session_dir)
                for fi in split_files:
                    logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                zip_path = make_zip_of_splits(split_files, session_dir)
            logging.info(f"Split/ZIP finished for {session_id}, zip_path={zip_path}")
            session_data['split_files'] = split_files
            session_data['zip_ready'] = True
            file_info = _build_file_info(f"Uploaded: {stored_filename}")
            results = _build_results(split_files, session_id)
            logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}, zip ready.")
            return file_info, False, results, session_data
        except Exception as e:
            logging.error(f"Error splitting PDF for session {session_id}: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), False, get_split_results_placeholder(), session_data

    # No explicit action: re-render whatever state the session store describes.
    restored_files = _trusted_split_files(session_data.get('split_files'), session_dir)
    if restored_files:
        file_info = _build_file_info(f"Uploaded: {stored_filename}")
        logging.info(f"handle_upload: Restoring split results for session {session_id}, {len(restored_files)} files.")
        return file_info, False, _build_results(restored_files, session_id), session_data

    if stored_filename:
        file_info = _build_file_info(f"Uploaded: {stored_filename}")
        logging.info("handle_upload: Restoring view after upload, split button enabled.")
        return file_info, False, get_split_results_placeholder(), session_data

    logging.info(f"handle_upload: No action taken, returning current session_data for session {session_id}")
    return "", True, get_split_results_placeholder(), session_data


@app.callback(
    Output("download-zip-dcc", "data"),
    Input("download-zip-btn", "n_clicks"),
    State('session-id-store', 'data'),
    prevent_initial_call=True
)
def trigger_download_zip(n_clicks, session_id):
    """Send the session's ZIP archive to the browser when the button is clicked."""
    if not n_clicks or not _is_valid_session_id(session_id):
        return dash.no_update
    zip_path = os.path.join(get_session_dir(session_id, create=False), _ZIP_NAME)
    if os.path.isfile(zip_path):
        logging.info(f"trigger_download_zip: Sending zip {zip_path} for session {session_id}")
        return dcc.send_file(zip_path)
    logging.error(f"trigger_download_zip: Zip file not found for session {session_id}")
    return dash.no_update


@app.server.route('/download_zip/<session_id>/<filename>')
def download_zip_file(session_id, filename):
    """Serve *filename* from the session directory as a ZIP attachment."""
    file_path = _resolve_session_file(session_id, filename)
    if file_path is None:
        logging.error(f"ZIP file not found for download: {filename}")
        return "File not found", 404
    logging.info(f"Serving zip file {file_path} for session {session_id}")
    return send_file(file_path, mimetype='application/zip', as_attachment=True, download_name=filename)


@app.server.route('/download_split/<session_id>/<filename>')
def download_split_file(session_id, filename):
    """Serve *filename* from the session directory as a PDF attachment."""
    file_path = _resolve_session_file(session_id, filename)
    if file_path is None:
        logging.error(f"Split file not found for download: {filename}")
        return "File not found", 404
    logging.info(f"Serving split file {file_path} for session {session_id}")
    return send_file(file_path, mimetype='application/pdf', as_attachment=True, download_name=filename)


if __name__ == '__main__':
    print("Starting the Dash application...")
    app.run(debug=False, host='0.0.0.0', port=7860, threaded=True)
    print("Dash application has finished running.")