Spaces:
Runtime error
Runtime error
File size: 4,164 Bytes
87d1db7 70b87d3 d3777b4 87d1db7 2b00e5d 725126b ca2c678 725126b 2b00e5d 725126b 2b00e5d 87d1db7 2b00e5d 712a7bd 725126b 712a7bd 725126b 712a7bd 725126b 2b00e5d 87d1db7 725126b 2b00e5d 712a7bd 87d1db7 ca2c678 2b00e5d ca2c678 725126b 2b00e5d ca2c678 725126b 2b00e5d 712a7bd ca2c678 2b00e5d ca2c678 87d1db7 ca2c678 87d1db7 725126b 70b87d3 2b00e5d 70b87d3 2b00e5d 70b87d3 2b00e5d 725126b 87d1db7 70b87d3 2b00e5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import subprocess
import sys

# Install Camelot with OpenCV support at startup.
# Use the running interpreter's own pip (`python -m pip`) so the package is
# installed into the environment this script actually executes in — a bare
# 'pip' on PATH may belong to a different Python installation.
# NOTE(review): `camelot` is never imported or used anywhere below
# (extraction is done with pdfplumber) — this install looks removable; confirm.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'camelot-py[cv]'])
import streamlit as st
import pdfplumber
import pandas as pd
import json
def extract_tables_with_pdfplumber(file):
    """Extract all tables from every page of a PDF.

    Args:
        file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        A list of dicts, one per extracted table, each with keys
        ``'page_number'`` (1-based page index) and ``'data'``
        (the raw table as a list of rows).
    """
    collected = []
    with pdfplumber.open(file) as pdf:
        # Page numbers are reported 1-based to match how PDFs are referenced.
        for page_no, page in enumerate(pdf.pages, start=1):
            for tbl in page.extract_tables():
                # Skip pages where extraction produced nothing.
                if not tbl:
                    continue
                collected.append({
                    'page_number': page_no,
                    'data': tbl,
                })
    return collected
def process_tables(tables):
    """Split extracted tables into two CSV files plus a JSON context file.

    All tables except the last are concatenated into ``table1.csv`` (treated
    as one potentially multi-page table); the last table becomes
    ``table2.csv``. A ``tables_context.json`` summary is written alongside.

    Args:
        tables: List of dicts as produced by ``extract_tables_with_pdfplumber``,
            each with ``'page_number'`` and ``'data'`` keys.

    Returns:
        ``(table1_filename, table2_filename, context_filename)`` on success,
        or ``(None, None, {})`` when fewer than two usable tables were found.
    """
    def create_dataframe(table_data):
        # The first row is treated as the header; bail out if it is missing.
        if not table_data or not table_data[0]:
            return pd.DataFrame()
        headers = table_data[0]
        unique_headers = []
        for header in headers:
            # De-duplicate repeated column names so pandas columns stay unique.
            # Loop until the candidate is truly unique — a single
            # "{header}_{index}" suffix could itself collide with a real
            # column (e.g. headers ['a', 'a', 'a_1']).
            new_header = header
            suffix = len(unique_headers)
            while new_header in unique_headers:
                new_header = f"{header}_{suffix}"
                suffix += 1
            unique_headers.append(new_header)
        return pd.DataFrame(table_data[1:], columns=unique_headers)

    all_tables_df = []
    for table_data in tables:
        df = create_dataframe(table_data['data'])
        if not df.empty:
            all_tables_df.append(df)

    # Need at least two tables: one (or more) for table1 and one for table2.
    # With a single table, pd.concat(all_tables_df[:-1]) would be
    # pd.concat([]) and raise ValueError — return the "not enough" sentinel
    # instead so the caller can show its user-facing message.
    if len(all_tables_df) < 2:
        return None, None, {}

    table1_df = pd.concat(all_tables_df[:-1], ignore_index=True)  # All but the last table
    table1_filename = "table1.csv"
    table1_df.to_csv(table1_filename, index=False)

    table2_df = all_tables_df[-1]  # The last table
    table2_filename = "table2.csv"
    table2_df.to_csv(table2_filename, index=False)

    context = {
        'tables_extracted': len(tables),
        'table1': {
            'description': 'Table 1 (potentially multi-page)',
            # Pages whose table header differs from the last table's header
            # are attributed to table 1.
            'pages': [t['page_number'] for t in tables
                      if t['data'][0] != tables[-1]['data'][0]],
            'csv_path': table1_filename,
            'rows': table1_df.shape[0],
            'columns': table1_df.shape[1]
        },
        'table2': {
            'description': 'Table 2 (last page)',
            'pages': tables[-1]['page_number'],
            'csv_path': table2_filename,
            'rows': table2_df.shape[0],
            'columns': table2_df.shape[1]
        }
    }
    context_filename = "tables_context.json"
    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename
# Streamlit Web App
# Top-level script: render the uploader, run extraction on the uploaded PDF,
# and offer the three generated files (two CSVs + JSON context) for download.
st.title("PDF Table Extractor with pdfplumber")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    try:
        tables = extract_tables_with_pdfplumber(uploaded_file)
        if not tables:
            st.error("No tables were extracted from the PDF.")
        else:
            table1, table2, context = process_tables(tables)
            if table1 and table2:
                st.success("Extraction complete.")
                # Read each output file inside a `with` block so the handle
                # is closed promptly (the original bare open(...).read()
                # leaked file handles).
                with open(table1, 'r') as f:
                    table1_data = f.read()
                with open(table2, 'r') as f:
                    table2_data = f.read()
                with open(context, 'r') as f:
                    context_data = f.read()
                # Provide download links for the output files
                st.download_button(
                    label="Download Table 1 CSV",
                    data=table1_data,
                    file_name=table1,
                    mime='text/csv'
                )
                st.download_button(
                    label="Download Table 2 CSV",
                    data=table2_data,
                    file_name=table2,
                    mime='text/csv'
                )
                st.download_button(
                    label="Download Context JSON",
                    data=context_data,
                    file_name=context,
                    mime='application/json'
                )
            else:
                st.error("Not enough tables found in PDF. Minimum of 2 tables are needed.")
    except Exception as e:
        # Top-level boundary: surface any failure to the user in the UI.
        st.error(f"An error occurred: {e}")
|