# AreejMehboob's picture
# Update src/streamlit_app.py
# b9e9522 verified
import io
import streamlit as st
import requests
import time
import os
from pathlib import Path
import glob
import base64
import pandas as pd
from datetime import datetime
# Configure the page (browser tab title/icon and overall layout). Kept ahead
# of every other Streamlit call in this script.
st.set_page_config(
    page_title="PDF Parser - Table Extraction Tool",
    # Restored from mojibake: the icon is the clipboard emoji, not the
    # mis-decoded byte sequence that previously appeared here.
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Custom CSS for styling - Grey and White Theme.
# Injected once as a raw <style> element (hence unsafe_allow_html=True). It
# defines the helper classes used by the HTML snippets in this file
# (main-header, feature-card, success-message, file-info-card, file-stats,
# table-container, table-header) and overrides several built-in Streamlit
# widget styles.
# NOTE(review): `.stSelectbox > div > div` is declared twice below; the later
# rule (with !important, adding border-color) is the one that takes effect.
st.markdown("""
<style>
.main-header {
text-align: center;
padding: 2rem 0;
background: linear-gradient(135deg, #6c757d 0%, #495057 100%);
border-radius: 10px;
margin-bottom: 2rem;
color: white;
}
.feature-card {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
text-align: center;
margin: 1rem 0;
border: 1px solid #dee2e6;
}
.demo-button {
background: linear-gradient(45deg, #6c757d, #495057);
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
font-weight: bold;
cursor: pointer;
margin: 10px;
}
.upload-button {
background: #495057;
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
font-weight: bold;
cursor: pointer;
margin: 10px;
}
.success-message {
background: #f8f9fa;
color: #495057;
padding: 15px;
border-radius: 5px;
border-left: 4px solid #6c757d;
margin: 20px 0;
}
.processing-message {
background: #f8f9fa;
color: #495057;
padding: 15px;
border-radius: 5px;
border-left: 4px solid #adb5bd;
margin: 20px 0;
}
.method-tab {
background: #f8f9fa;
padding: 10px 15px;
border-radius: 5px;
margin: 5px;
cursor: pointer;
border: 2px solid #dee2e6;
}
.method-tab-active {
background: #6c757d;
color: white;
border: 2px solid #495057;
}
.html-file-card {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
margin: 10px 0;
border-left: 4px solid #6c757d;
}
.file-info-card {
background: #f8f9fa;
padding: 12px;
border-radius: 8px;
margin: 5px 0;
border-left: 4px solid #6c757d;
font-size: 0.9em;
}
.file-stats {
color: #6c757d;
font-size: 0.85em;
margin-top: 5px;
}
.stSelectbox > div > div {
background-color: #f8f9fa;
}
.hidden-text {
color: #adb5bd;
font-style: italic;
}
.table-container {
max-height: 400px;
overflow-y: auto;
border: 1px solid #dee2e6;
border-radius: 5px;
padding: 10px;
margin: 10px 0;
background-color: white;
}
.table-header {
background: #f8f9fa;
padding: 10px;
border-radius: 5px;
margin-bottom: 10px;
border-left: 4px solid #6c757d;
}
/* Override Streamlit button styles */
.stButton > button {
background-color: #6c757d !important;
color: white !important;
border: 1px solid #495057 !important;
border-radius: 5px !important;
}
.stButton > button:hover {
background-color: #495057 !important;
border-color: #343a40 !important;
}
/* Override primary button styles */
.stButton > button[kind="primary"] {
background-color: #495057 !important;
color: white !important;
border: 1px solid #343a40 !important;
}
.stButton > button[kind="primary"]:hover {
background-color: #343a40 !important;
}
/* Style checkboxes */
.stCheckbox > label {
color: #495057 !important;
}
/* Style text inputs */
.stTextInput > div > div > input {
background-color: #f8f9fa !important;
border-color: #dee2e6 !important;
}
/* Style file uploader */
.stFileUploader > div {
background-color: #f8f9fa !important;
border-color: #dee2e6 !important;
}
/* Style dataframes */
.stDataFrame {
background-color: white !important;
border: 1px solid #dee2e6 !important;
}
/* Style selectbox */
.stSelectbox > div > div {
background-color: #f8f9fa !important;
border-color: #dee2e6 !important;
}
/* Style progress bar */
.stProgress > div > div > div {
background-color: #6c757d !important;
}
</style>
""", unsafe_allow_html=True)
# Seed session state with defaults. Keys that already exist (i.e. on every
# rerun after the first) are left untouched so user choices survive reruns.
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Directory that contains this script; every other path is derived from it.
SCRIPT_DIR = Path(__file__).parent
# Bundled Tesla demo PDF, expected to sit next to this script.
TESLA_DOC_PATH = SCRIPT_DIR.joinpath("tesla_docs_28-41 (1)-9-14.pdf")
# Root folder under which extraction output is looked up (<script dir>/output).
OUTPUT_BASE_PATH = SCRIPT_DIR.joinpath("output")
def show_home_page():
    """Render the landing page: hero header, two entry buttons, feature cards.

    Clicking a button stores the target page name in ``st.session_state.page``
    and triggers a rerun so that ``main`` routes to it.
    """
    # Hero header
    st.markdown("""
<div class="main-header">
<h1 style="font-size: 3rem; margin: 0; color: #f8f9fa;">Transform PDF Tables to</h1>
<h1 style="font-size: 3rem; margin: 0; color: #ffffff;">HTML and Excel</h1>
<p style="margin-top: 1rem; font-size: 1.2rem; opacity: 0.9;">Powered by Traversaal.ai</p>
<p style="margin-top: 0.5rem; opacity: 0.8;">Perfect for financial reports, research papers, and data analysis.</p>
</div>
""", unsafe_allow_html=True)

    # Main buttons, centred by placing them in the wide middle column.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        col_upload, col_demo = st.columns(2)
        with col_upload:
            if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
                st.session_state.page = 'upload'
                st.rerun()
        with col_demo:
            if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
                st.session_state.page = 'demo_setup'
                st.rerun()

    # Features section: one card per column, all built from the same template.
    st.markdown("---")
    features = [
        ("⚡ Lightning Fast",
         "Process complex PDFs in seconds with our advanced AI algorithms"),
        ("🔒 Secure & Private",
         "Your documents are processed securely and never stored permanently"),
        ("🔄 Batch Processing",
         "Handle multiple documents and tables simultaneously"),
    ]
    for column, (title, blurb) in zip(st.columns(3), features):
        with column:
            st.markdown(f"""
<div class="feature-card">
<h3 style="color: #495057;">{title}</h3>
<p style="color: #6c757d;">{blurb}</p>
</div>
""", unsafe_allow_html=True)
def show_upload_page():
    """Render the upload page and start processing when requested.

    The user provides a PDF either through the uploader widget or as a typed
    path, an output directory, and at least one extraction method. On
    "Process Document" the inputs are validated and handed to
    ``process_document``.
    """
    import tempfile  # local import: only needed to stage uploaded bytes

    st.markdown("## 📄 Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file"
    )

    # Output directory; rendered masked until the user toggles visibility.
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )

    # Show/Hide output directory toggle
    _, toggle_col = st.columns([3, 1])
    with toggle_col:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    # Process button
    if st.button("🚀 Process Document", type="primary"):
        has_input = bool(uploaded_file or input_file_path)
        has_method = docling or llamaparse or unstructured
        if has_input and output_dir and has_method:
            if input_file_path:
                # An explicitly typed path wins over an uploaded file.
                process_document(input_file_path, output_dir, docling, llamaparse, unstructured)
            else:
                # The uploader only holds the bytes in memory and its ``name``
                # is a bare file name, not a path on disk. Stage the bytes in
                # a temporary directory so the processing service receives a
                # path that actually exists. process_document blocks until the
                # request has finished, so the copy can be removed afterwards.
                with tempfile.TemporaryDirectory() as staging_dir:
                    staged_pdf = Path(staging_dir) / Path(uploaded_file.name).name
                    staged_pdf.write_bytes(uploaded_file.getvalue())
                    process_document(str(staged_pdf), output_dir, docling, llamaparse, unstructured)
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")

    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()
def show_demo_setup_page():
    """Render the Tesla demo setup page (method selection only).

    The checkbox states are written back to
    ``st.session_state.demo_selected_methods`` on every run. Pressing the
    process button switches to the 'demo' page with ``processing`` set.
    """
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    # Document info
    st.markdown("### 📄 Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    # Extraction method selection, pre-filled from the previous choice.
    st.markdown("### 🔧 Select Extraction Methods")
    previous = st.session_state.demo_selected_methods
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling",
                              value=previous['docling'],
                              help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse",
                                 value=previous['llamaparse'],
                                 help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured",
                                   value=previous['unstructured'],
                                   help="General purpose extraction")

    # Update session state
    st.session_state.demo_selected_methods = {
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }

    # Process / back buttons
    process_col, back_col = st.columns([2, 1])
    with process_col:
        if st.button("🚀 Process Tesla Document", type="primary"):
            if docling or llamaparse or unstructured:
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()
            else:
                st.error("Please select at least one extraction method.")
    with back_col:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()
def show_demo_page():
    """Show the progress view while processing is flagged, else the results."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()
def show_processing_demo():
    """Animate a progress display for the Tesla demo, then run the demo.

    The progress bar is driven purely by a timed loop (30 steps per selected
    method). ``process_tesla_demo`` is invoked after the loop finishes; the
    ``processing`` flag is then cleared and the script reruns so the results
    view is shown.
    """
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Show selected methods
    selected_methods = [
        method for method, selected in st.session_state.demo_selected_methods.items() if selected
    ]
    st.markdown(
        f"*Processing with selected methods: {', '.join(m.title() for m in selected_methods)}*"
    )

    # Progress widgets
    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # Which method this step belongs to, and how far into it we are.
        method_index, method_step = divmod(step, steps_per_method)
        method_name = selected_methods[method_index].title()
        method_progress = (method_step + 1) / steps_per_method
        percent = int(method_progress * 100)

        # Update status messages
        if method_progress < 0.3:
            status_text.text(f"📄 {method_name}: Reading document... {percent}%")
        elif method_progress < 0.7:
            status_text.text(f"🔍 {method_name}: Extracting tables... {percent}%")
        else:
            status_text.text(f"💾 {method_name}: Generating HTML outputs... {percent}%")

        method_status.markdown(
            f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {method_name}"
        )
        time.sleep(0.33)

    # Show completion
    st.markdown("""
<div class="success-message">
✅ <strong>Document processed successfully!</strong><br>
Tables have been extracted using selected methods and HTML files are ready for viewing.
</div>
""", unsafe_allow_html=True)

    # Process Tesla demo
    process_tesla_demo()
    st.session_state.processing = False
    time.sleep(2)  # leave the completion banner visible briefly before rerun
    st.rerun()
def process_tesla_demo():
    """Store simulated extraction results for the selected demo methods.

    No request is sent: the payload for the extraction endpoint is assembled
    but the call itself is still commented out. Any exception is reported in
    the UI via ``st.error`` instead of propagating.
    """
    try:
        chosen = st.session_state.demo_selected_methods
        tesla_output_dir = OUTPUT_BASE_PATH / "tesla_demo"

        # Payload for the extraction endpoint (not sent yet, see below).
        data = {
            'input_file_path': str(TESLA_DOC_PATH),
            'output_dir': str(tesla_output_dir),
            'docling': chosen['docling'],
            'llamaparse': chosen['llamaparse'],
            'unstructured': chosen['unstructured']
        }
        # Make request to FastAPI endpoint (uncomment when ready)
        # response = requests.post('http://localhost:8000/extract', data=data)
        # if response.status_code == 200:
        #     st.session_state.demo_results = response.json()

        # Simulated table counts, reported only for the methods switched on.
        simulated_table_counts = {'docling': 5, 'llamaparse': 3, 'unstructured': 4}
        results = {
            method: {'status': 'success', 'total_tables': table_count}
            for method, table_count in simulated_table_counts.items()
            if chosen[method]
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:
        st.error(f"Error processing Tesla demo: {str(e)}")
def count_html_files(directory):
    """Count the ``*.html`` files in *directory* and all of its subdirectories.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        The number of distinct HTML files found, or 0 when *directory* does
        not exist.
    """
    if not os.path.exists(directory):
        return 0
    # The recursive "**" pattern already matches files directly inside
    # *directory*; globbing "*.html" as well would count those files twice.
    pattern = os.path.join(str(directory), "**", "*.html")
    return len(set(glob.glob(pattern, recursive=True)))
def get_excel_files(directory):
    """Return the spreadsheet files found under *directory*, without duplicates.

    ``.xlsx`` and ``.xls`` files are collected from *directory* and all of its
    subdirectories; ``.csv`` files only from *directory* itself.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list of file paths, each appearing once, or an empty list when
        *directory* does not exist.
    """
    if not os.path.exists(directory):
        return []
    base = str(directory)
    found = []
    for pattern in ("*.xlsx", "*.xls", "*.csv"):
        found.extend(glob.glob(os.path.join(base, pattern)))
    for pattern in ("*.xlsx", "*.xls"):
        found.extend(glob.glob(os.path.join(base, "**", pattern), recursive=True))
    # The recursive patterns match the top-level files a second time;
    # dict.fromkeys drops those repeats while keeping first-seen order.
    return list(dict.fromkeys(found))
def get_file_info(file_path):
    """Return display-ready size and modification time for *file_path*.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        ``{"size": "<n.n> KB", "modified": "YYYY-MM-DD HH:MM"}`` for a file
        that can be stat'ed, otherwise ``{"size": 0, "modified": "Unknown"}``.
    """
    # Stat directly instead of checking existence first: the file may vanish
    # (or be unreadable) between the check and the stat call.
    try:
        stat = os.stat(file_path)
    except OSError:
        return {"size": 0, "modified": "Unknown"}
    size_kb = stat.st_size / 1024
    modified = datetime.fromtimestamp(stat.st_mtime)
    return {
        "size": f"{size_kb:.1f} KB",
        "modified": modified.strftime("%Y-%m-%d %H:%M"),
    }
def show_demo_results():
    """Render the Tesla demo results page.

    Shows the document summary and a reset button, lets the user pick among
    the methods that were selected for the demo (when more than one), and
    delegates the actual result display to ``show_method_results``.
    """
    st.markdown("## 📊 Tesla 10K Processing Results")

    # Methods that were enabled for this demo run.
    available_methods = [
        method for method, selected in st.session_state.demo_selected_methods.items() if selected
    ]

    # Document info
    info_col, reset_col = st.columns([2, 1])
    with info_col:
        st.markdown("### 📄 tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** ✅ Complete")
        st.markdown(f"**Processed with:** {', '.join(m.title() for m in available_methods)}")
    with reset_col:
        if st.button("🔄 Reset"):
            # Restore every piece of demo state to its initial value.
            st.session_state.page = 'home'
            st.session_state.processing = False
            st.session_state.results = None
            st.session_state.demo_results = None
            st.session_state.selected_method = None
            st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
            st.rerun()

    # Method selection tabs - only shown when there is a choice to make.
    if len(available_methods) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")
        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse',
            'unstructured': '📊 Unstructured'
        }
        for column, method in zip(st.columns(len(available_methods)), available_methods):
            with column:
                # HTML file count, found the same way show_html_tables does.
                method_output_dir = OUTPUT_BASE_PATH / method
                html_files = set()
                if os.path.exists(method_output_dir):
                    pattern = os.path.join(str(method_output_dir), "**", "*.html")
                    html_files = set(glob.glob(pattern, recursive=True))
                button_label = f"{method_labels[method]} ({len(html_files)} HTML files)"
                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Default to first available method if none (or a stale one) is selected.
    if st.session_state.selected_method not in available_methods:
        st.session_state.selected_method = available_methods[0] if available_methods else None

    # Show results for selected method
    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)
def show_method_results(method):
    """Show one method's HTML tables and spreadsheet files side by side.

    Args:
        method: Extraction method key (e.g. ``'docling'``); also the name of
            its folder under the output base path.
    """
    st.markdown(f"### 📋 Results from {method.title()}")
    # 3:1 width ratio: the HTML tables get most of the horizontal space.
    tables_col, files_col = st.columns([3, 1])
    with tables_col:
        st.markdown("#### 📄 HTML Tables")
        show_html_tables(method)
    with files_col:
        st.markdown("#### 📊 Excel Files")
        show_excel_files(method)
def show_html_tables(method):
    """Display every HTML table found in the method's output directory.

    Files are searched recursively under ``OUTPUT_BASE_PATH / method``, ordered
    by the number in a ``table_<n>`` / ``table-<n>`` file name (files without
    a number go last, alphabetically), rendered inline, and each offered as a
    download.

    Args:
        method: Extraction method key; also the output sub-folder name.
    """
    import re

    method_output_dir = OUTPUT_BASE_PATH / method

    # The recursive glob also matches files directly inside the directory.
    html_files = []
    if os.path.exists(method_output_dir):
        pattern = os.path.join(str(method_output_dir), "**", "*.html")
        html_files = list(set(glob.glob(pattern, recursive=True)))

    table_number = re.compile(r"table[_-](\d+)", re.IGNORECASE)

    def sort_key(path):
        # The base name breaks ties so the order does not depend on set order.
        name = os.path.basename(path)
        match = table_number.search(name)
        return (int(match.group(1)) if match else float('inf'), name)

    html_files.sort(key=sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")
    st.markdown('<div class="table-container">', unsafe_allow_html=True)
    last_index = len(html_files) - 1
    for i, html_file in enumerate(html_files):
        st.markdown(f"""
<div class="table-header">
<h4 style="color: #495057;">📋 Table {i+1}</h4>
<small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
</div>
""", unsafe_allow_html=True)

        # Read the file once and reuse the text for preview and download.
        html_content = None
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
            st.components.v1.html(html_content, height=300, scrolling=True)
        except Exception as e:
            st.error(f"Error displaying HTML file: {e}")

        # Download button for the individual HTML file (only if it was read).
        if html_content is not None:
            download_col, _, _ = st.columns([1, 1, 2])
            with download_col:
                st.download_button(
                    label=f"⬇️ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True
                )

        if i < last_index:
            st.markdown("---")
    st.markdown('</div>', unsafe_allow_html=True)
def show_excel_files(method):
    """Display the spreadsheet files found in the method's output directory.

    For each file returned by ``get_excel_files`` this shows an info card, a
    five-row preview and a download button.

    Args:
        method: Extraction method key; also the output sub-folder name.
    """
    method_output_dir = OUTPUT_BASE_PATH / method
    excel_files = get_excel_files(method_output_dir)

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    # Download MIME type per extension, so CSV and legacy .xls files are not
    # served with the .xlsx content type.
    mime_types = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".csv": "text/csv",
    }

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")
    last_index = len(excel_files) - 1
    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = os.path.basename(excel_file)
        suffix = os.path.splitext(file_name)[1].lower()

        # File info card
        st.markdown(f"""
<div class="file-info-card">
<strong style="color: #495057;">📊 {file_name}</strong>
<div class="file-stats">
<strong>Size:</strong> {file_info['size']}<br>
<strong>Modified:</strong> {file_info['modified']}
</div>
</div>
""", unsafe_allow_html=True)

        # Preview: CSV files go straight to the CSV reader; anything else is
        # tried as Excel first and falls back to CSV if that fails.
        df = None
        read_as_csv = suffix == ".csv"
        try:
            if read_as_csv:
                df = pd.read_csv(excel_file)
            else:
                try:
                    df = pd.read_excel(excel_file)
                except Exception:
                    read_as_csv = True
                    df = pd.read_csv(excel_file)
        except Exception as e:
            st.warning(f"Could not preview file as Excel or CSV: {e}")

        if df is not None:
            if df.empty:
                st.info("CSV file is empty" if read_as_csv else "Excel file is empty")
            else:
                if read_as_csv:
                    st.markdown("**Preview (first 5 rows, read as CSV):**")
                else:
                    st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")

        # Download button for the file
        try:
            with open(excel_file, 'rb') as f:
                excel_data = f.read()
            st.download_button(
                label="⬇️ Download",
                data=excel_data,
                file_name=file_name,
                mime=mime_types.get(suffix, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )
        except Exception as e:
            st.error(f"Error reading Excel file for download: {e}")

        if i < last_index:
            st.markdown("---")
def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Send a document to the extraction endpoint and display the outcome.

    Args:
        file_path: Path of the PDF to process, as sent to the service.
        output_dir: Directory the service is asked to write results to; also
            scanned afterwards for generated files.
        docling: Whether to run the Docling method.
        llamaparse: Whether to run the LlamaParse method.
        unstructured: Whether to run the Unstructured method.

    The JSON response is stored in ``st.session_state.results``. Connection
    failures and any other exception are reported via ``st.error`` rather
    than raised.
    """
    try:
        # Prepare the request data
        data = {
            'input_file_path': file_path,
            'output_dir': output_dir,
            'docling': docling,
            'llamaparse': llamaparse,
            'unstructured': unstructured
        }

        with st.spinner('Processing document...'):
            # Replace with your actual FastAPI endpoint URL
            response = requests.post('http://localhost:8000/extract', data=data)

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        st.session_state.results = response.json()
        st.success("Document processed successfully!")

        # A response without a 'results' key is treated as "nothing extracted"
        # instead of surfacing as a cryptic KeyError message.
        results = (st.session_state.results or {}).get('results', {})

        # Method selection for viewing results
        st.markdown("### 📊 View Results")
        available_methods = [
            method for method in ['docling', 'llamaparse', 'unstructured']
            if method in results and isinstance(results[method], dict)
        ]
        if not available_methods:
            st.warning("No successful extractions found.")
            return

        selected_method = st.selectbox(
            "Select extraction method to view:",
            available_methods,
            help="Choose which extraction method results to display"
        )
        if not (selected_method and isinstance(results[selected_method], dict)):
            return

        st.json(results[selected_method])

        # List files in the method's output directory.
        method_dir = os.path.join(output_dir, selected_method)
        # The recursive pattern already covers files directly in method_dir;
        # globbing "*.html" as well would list each of them twice.
        html_files = sorted(set(
            glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True)
        ))
        excel_files = get_excel_files(method_dir)

        if html_files or excel_files:
            st.markdown("### 📄 Generated Files")
            if html_files:
                st.markdown("**HTML Files:**")
                for html_file in html_files:
                    st.markdown(f"- {os.path.basename(html_file)}")
            if excel_files:
                st.markdown("**Excel Files:**")
                for excel_file in excel_files:
                    st.markdown(f"- {os.path.basename(excel_file)}")
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
def main():
    """Render the navigation header and route to the current page.

    The page to show is read from ``st.session_state.page``; unknown values
    render nothing below the header.
    """
    # Navigation header
    title_col, nav_col = st.columns([1, 1])
    with title_col:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_col:
        dashboard_col, history_col = st.columns(2)
        with dashboard_col:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_col:
            # Rendered for layout only; no action is attached to it.
            st.button("History", use_container_width=True)
    st.markdown("---")

    # Route to appropriate page
    pages = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    render_page = pages.get(st.session_state.page)
    if render_page is not None:
        render_page()
# Entry point: render the app when this file is executed as a script.
if __name__ == "__main__":
    main()