Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -11,7 +11,6 @@ def extract_tables(file_path):
     with pdfplumber.open(file_path) as pdf:
         for page_number, page in enumerate(pdf.pages, start=1):
             extracted_tables = page.extract_tables()
-
             for table_index, table in enumerate(extracted_tables):
                 df = pd.DataFrame(table[1:], columns=table[0])  # Use the first row as headers
                 table_info = {
@@ -23,10 +22,7 @@ def extract_tables(file_path):
     return all_tables
 
 def aggregate_and_save_tables(tables_list):
-    json_metadata = {
-        'tables': [],
-        'total_tables': len(tables_list)
-    }
+    json_metadata = {'tables': [], 'total_tables': len(tables_list)}
     current_combined_table = None
     combined_csv_files = []
 
@@ -34,15 +30,17 @@ def aggregate_and_save_tables(tables_list):
         page_number = table_info['page_number']
         df = table_info['dataframe']
 
-        #
+        # Check if headers match for logical table continuation
         if current_combined_table is None:
             current_combined_table = df
         else:
-            #
-
+            # Ensure headers are aligned in structure
+            headers_match = len(current_combined_table.columns) == len(df.columns) and \
+                            (current_combined_table.columns == df.columns).all()
+            if headers_match:
                 current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
             else:
-                # Save
+                # Save the current table as CSV
                 file_name = f"combined_table_page_{page_number - 1}.csv"
                 current_combined_table.to_csv(file_name, index=False)
                 combined_csv_files.append(file_name)
@@ -51,10 +49,10 @@ def aggregate_and_save_tables(tables_list):
                     'description': f"Combined table up to page {page_number - 1}",
                     'rows': current_combined_table.shape[0]
                 })
-                # Start a new
+                # Start a new table
                 current_combined_table = df
 
-    # Save the last combined table if exists
+    # Save the last combined table if it exists
     if current_combined_table is not None:
         file_name = f"combined_table_final.csv"
         current_combined_table.to_csv(file_name, index=False)
@@ -77,22 +75,19 @@ st.title("PDF Table Extractor for Combined Tables")
 
 uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if uploaded_file is not None:
-    # Save the uploaded file temporarily
     temp_file_path = "temp_uploaded_file.pdf"
 
     with open(temp_file_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
     try:
-        # Extract tables and process into combined CSVs and a JSON metadata file
         extracted_tables = extract_tables(temp_file_path)
-        os.remove(temp_file_path)
-
+        os.remove(temp_file_path)
+
         combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
 
         st.success("Table extraction and aggregation complete.")
 
-        # Provide download links for each generated CSV file
         for csv_file in combined_csvs:
             st.download_button(
                 label=f"Download {csv_file}",
@@ -101,7 +96,6 @@ if uploaded_file is not None:
                 mime='text/csv'
            )
 
-        # Provide download link for the JSON metadata file
         st.download_button(
             label="Download JSON Metadata",
             data=open(metadata_file, 'r').read(),
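The header-continuation check added in this commit can be exercised on its own. A minimal sketch, assuming two toy DataFrames stand in for tables extracted from consecutive pages; the column names and values are illustrative, not from the app:

import pandas as pd

# Fragments with identical headers are treated as one logical table
# that continues across a page break.
a = pd.DataFrame([[1, 2]], columns=["qty", "price"])
b = pd.DataFrame([[3, 4]], columns=["qty", "price"])

# The length comparison must run first: comparing two pandas Index
# objects of different lengths raises a ValueError, so the short-circuit
# guards the element-wise comparison that follows.
headers_match = len(a.columns) == len(b.columns) and \
                (a.columns == b.columns).all()

if headers_match:
    combined = pd.concat([a, b], ignore_index=True)
    print(combined)  # two rows under the shared qty/price header

Mismatched headers take the other branch in the app: the accumulated table is flushed to CSV and the new fragment starts a fresh one.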
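For reference, the extraction side that the hunks above show only in part can be pieced together as a standalone sketch. The all_tables initializer and the 'table_index' key are assumptions; only the loop body, the 'page_number' and 'dataframe' keys, and the final return all_tables are visible in the diff:

import pdfplumber
import pandas as pd

def extract_tables(file_path):
    all_tables = []  # assumed initializer; the diff only shows the return
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for table_index, table in enumerate(page.extract_tables()):
                # Use the first row as headers, as in the original code
                df = pd.DataFrame(table[1:], columns=table[0])
                all_tables.append({
                    'page_number': page_number,  # read back by aggregate_and_save_tables
                    'table_index': table_index,  # assumed key; the loop variable exists either way
                    'dataframe': df,
                })
    return all_tables

Because aggregate_and_save_tables consumes this list directly, both halves of the pipeline can be tested on a local PDF without going through the Streamlit upload path.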