# (hosting-page residue from copy/paste, kept as comments so the file parses)
# Spaces:
# Runtime error / Runtime error
# NOTE(review): the original ran `pip install camelot-py[cv]` via subprocess at
# import time, but camelot is never used in this script — all extraction is done
# with pdfplumber. Installing packages at runtime is fragile, slow on every app
# rerun, and may fail in locked-down deployments; declare dependencies in
# requirements.txt instead. The install call has been removed.
import subprocess  # no longer used here; kept so any external reference survives

import json

import pandas as pd
import pdfplumber
import streamlit as st
def extract_tables_with_pdfplumber(file):
    """Collect every table pdfplumber detects in *file*.

    Returns a list of dicts, one per detected non-empty table, each with the
    1-based ``page_number`` it was found on and its raw row ``data``.
    """
    found = []
    with pdfplumber.open(file) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            for raw_table in page.extract_tables():
                # pdfplumber may report empty extractions; skip those.
                if not raw_table:
                    continue
                found.append({'page_number': page_no, 'data': raw_table})
    return found
def process_tables(tables):
    """Split extracted tables into "table 1" (all but the last, concatenated)
    and "table 2" (the last), writing each to CSV plus a JSON context file.

    Parameters
    ----------
    tables : list of dicts ``{'page_number': int, 'data': list-of-rows}``, as
        produced by ``extract_tables_with_pdfplumber``.

    Returns
    -------
    ``(table1_csv, table2_csv, context_json)`` filenames on success, or
    ``(None, None, {})`` when fewer than two usable tables were found.
    """

    def _create_dataframe(table_data):
        """Build a DataFrame from raw rows, using the first row as the header."""
        # Guard against missing data or a missing/empty header row.
        if not table_data or not table_data[0]:
            return pd.DataFrame()
        # De-duplicate header names so pandas gets unique columns.
        # BUG FIX: the original suffixed duplicates with len(unique_headers),
        # which can itself collide with a later header; use a per-name counter
        # and re-check until the candidate is actually unique.
        unique_headers = []
        dup_counts = {}
        for header in table_data[0]:
            name = header
            while name in unique_headers:
                dup_counts[header] = dup_counts.get(header, 0) + 1
                name = f"{header}_{dup_counts[header]}"
            unique_headers.append(name)
        return pd.DataFrame(table_data[1:], columns=unique_headers)

    # Pair each source table with its (non-empty) DataFrame so page numbers
    # stay attached to the frames that actually get written out.
    usable = []
    for table in tables:
        df = _create_dataframe(table['data'])
        if not df.empty:
            usable.append((table, df))

    # BUG FIX: the original only checked for zero tables; with exactly one,
    # pd.concat([]) raised ValueError. Table 1 needs at least one frame and
    # table 2 one more, so require two.
    if len(usable) < 2:
        return None, None, {}

    table1_df = pd.concat([df for _, df in usable[:-1]], ignore_index=True)
    table1_filename = "table1.csv"
    table1_df.to_csv(table1_filename, index=False)

    last_table, table2_df = usable[-1]
    table2_filename = "table2.csv"
    table2_df.to_csv(table2_filename, index=False)

    context = {
        'tables_extracted': len(tables),
        'table1': {
            'description': 'Table 1 (potentially multi-page)',
            # BUG FIX: originally computed by comparing header rows against the
            # last table's header, which mislabels pages whenever tables share
            # headers. Report the pages of the tables actually concatenated.
            'pages': [t['page_number'] for t, _ in usable[:-1]],
            'csv_path': table1_filename,
            'rows': table1_df.shape[0],
            'columns': table1_df.shape[1],
        },
        'table2': {
            'description': 'Table 2 (last page)',
            'pages': last_table['page_number'],
            'csv_path': table2_filename,
            'rows': table2_df.shape[0],
            'columns': table2_df.shape[1],
        },
    }
    context_filename = "tables_context.json"
    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)
    return table1_filename, table2_filename, context_filename
# --- Streamlit Web App -------------------------------------------------------
st.title("PDF Table Extractor with pdfplumber")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    try:
        tables = extract_tables_with_pdfplumber(uploaded_file)
        if not tables:
            st.error("No tables were extracted from the PDF.")
        else:
            table1, table2, context = process_tables(tables)
            if table1 and table2:
                st.success("Extraction complete.")

                def _read_text(path):
                    # BUG FIX: the original used open(path, 'r').read() without
                    # ever closing the handle, leaking a file descriptor per
                    # download button on every Streamlit rerun.
                    with open(path, 'r') as fh:
                        return fh.read()

                # Provide download links for the output files.
                st.download_button(
                    label="Download Table 1 CSV",
                    data=_read_text(table1),
                    file_name=table1,
                    mime='text/csv',
                )
                st.download_button(
                    label="Download Table 2 CSV",
                    data=_read_text(table2),
                    file_name=table2,
                    mime='text/csv',
                )
                st.download_button(
                    label="Download Context JSON",
                    data=_read_text(context),
                    file_name=context,
                    mime='application/json',
                )
            else:
                st.error("Not enough tables found in PDF. Minimum of 2 tables are needed.")
    except Exception as e:
        # Top-level boundary: surface any failure to the user instead of a
        # blank Streamlit stack trace.
        st.error(f"An error occurred: {e}")