# PDF Table Extractor — Streamlit app (app.py).
# NOTE(review): the lines above this file's code were web-page scrape residue
# ("Guhanselvam's picture / Update app.py / 725126b verified") and have been
# converted into this comment so the module parses as valid Python.
import subprocess
subprocess.check_call(['pip', 'install', 'camelot-py[cv]']) # Install Camelot with OpenCV support
import streamlit as st
import pdfplumber
import pandas as pd
import json
def extract_tables_with_pdfplumber(file):
tables = []
with pdfplumber.open(file) as pdf:
for page_number, page in enumerate(pdf.pages, 1): # Start page numbers at 1
extracted_tables = page.extract_tables()
for table in extracted_tables:
if table: # Check if a table was actually extracted
tables.append({
'page_number': page_number,
'data': table
})
return tables
def process_tables(tables):
def create_dataframe(table_data):
# Handle cases where headers are missing or inconsistent
if not table_data or not table_data[0]:
return pd.DataFrame() # Return an empty DataFrame if no headers are found
headers = table_data[0]
unique_headers = []
for header in headers:
if header in unique_headers:
new_header = f"{header}_{len(unique_headers)}"
unique_headers.append(new_header)
else:
unique_headers.append(header)
return pd.DataFrame(table_data[1:], columns=unique_headers)
all_tables_df = []
for table_data in tables:
df = create_dataframe(table_data['data'])
if not df.empty:
all_tables_df.append(df)
if not all_tables_df:
return None, None, {}
table1_df = pd.concat(all_tables_df[:-1], ignore_index=True) # All but the last table
table1_filename = "table1.csv"
table1_df.to_csv(table1_filename, index=False)
table2_df = all_tables_df[-1] # The last table
table2_filename = "table2.csv"
table2_df.to_csv(table2_filename, index=False)
context = {
'tables_extracted': len(tables),
'table1': {
'description': 'Table 1 (potentially multi-page)',
'pages': [t['page_number'] for t in tables if not t['data'][0] == tables[-1]['data'][0] ],
'csv_path': table1_filename,
'rows': table1_df.shape[0],
'columns': table1_df.shape[1]
},
'table2': {
'description': 'Table 2 (last page)',
'pages': tables[-1]['page_number'],
'csv_path': table2_filename,
'rows': table2_df.shape[0],
'columns': table2_df.shape[1]
}
}
context_filename = "tables_context.json"
with open(context_filename, 'w') as json_file:
json.dump(context, json_file, indent=4)
return table1_filename, table2_filename, context_filename
# Streamlit Web App
st.title("PDF Table Extractor with pdfplumber")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
try:
tables = extract_tables_with_pdfplumber(uploaded_file)
if not tables:
st.error("No tables were extracted from the PDF.")
else:
table1, table2, context = process_tables(tables)
if table1 and table2:
st.success("Extraction complete.")
# Provide download links for the output files
st.download_button(
label="Download Table 1 CSV",
data=open(table1, 'r').read(),
file_name=table1,
mime='text/csv'
)
st.download_button(
label="Download Table 2 CSV",
data=open(table2, 'r').read(),
file_name=table2,
mime='text/csv'
)
st.download_button(
label="Download Context JSON",
data=open(context, 'r').read(),
file_name=context,
mime='application/json'
)
else:
st.error("Not enough tables found in PDF. Minimum of 2 tables are needed.")
except Exception as e:
st.error(f"An error occurred: {e}")