Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -11,7 +11,6 @@ def extract_tables(file_path):
     with pdfplumber.open(file_path) as pdf:
         for page_number, page in enumerate(pdf.pages, start=1):
             extracted_tables = page.extract_tables()
-
             for table_index, table in enumerate(extracted_tables):
                 df = pd.DataFrame(table[1:], columns=table[0])  # Use the first row as headers
                 table_info = {
@@ -23,10 +22,7 @@ def extract_tables(file_path):
     return all_tables
 
 def aggregate_and_save_tables(tables_list):
-    json_metadata = {
-        'tables': [],
-        'total_tables': len(tables_list)
-    }
+    json_metadata = {'tables': [], 'total_tables': len(tables_list)}
     current_combined_table = None
     combined_csv_files = []
 
@@ -34,15 +30,17 @@ def aggregate_and_save_tables(tables_list):
         page_number = table_info['page_number']
         df = table_info['dataframe']
 
-        #
+        # Check if headers match for logical table continuation
         if current_combined_table is None:
             current_combined_table = df
         else:
-            #
-
+            # Ensure headers are aligned in structure
+            headers_match = len(current_combined_table.columns) == len(df.columns) and \
+                            (current_combined_table.columns == df.columns).all()
+            if headers_match:
                 current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
             else:
-                # Save
+                # Save the current table as CSV
                 file_name = f"combined_table_page_{page_number - 1}.csv"
                 current_combined_table.to_csv(file_name, index=False)
                 combined_csv_files.append(file_name)
@@ -51,10 +49,10 @@ def aggregate_and_save_tables(tables_list):
                     'description': f"Combined table up to page {page_number - 1}",
                     'rows': current_combined_table.shape[0]
                 })
-                # Start a new
+                # Start a new table
                 current_combined_table = df
 
-    # Save the last combined table if exists
+    # Save the last combined table if it exists
     if current_combined_table is not None:
         file_name = f"combined_table_final.csv"
         current_combined_table.to_csv(file_name, index=False)
@@ -77,22 +75,19 @@ st.title("PDF Table Extractor for Combined Tables")
 
 uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if uploaded_file is not None:
-    # Save the uploaded file temporarily
     temp_file_path = "temp_uploaded_file.pdf"
 
     with open(temp_file_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
     try:
-        # Extract tables and process into combined CSVs and a JSON metadata file
         extracted_tables = extract_tables(temp_file_path)
-        os.remove(temp_file_path)
-
+        os.remove(temp_file_path)
+
         combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
 
         st.success("Table extraction and aggregation complete.")
 
-        # Provide download links for each generated CSV file
         for csv_file in combined_csvs:
             st.download_button(
                 label=f"Download {csv_file}",
@@ -101,7 +96,6 @@ if uploaded_file is not None:
                 mime='text/csv'
            )
 
-        # Provide download link for the JSON metadata file
         st.download_button(
             label="Download JSON Metadata",
             data=open(metadata_file, 'r').read(),
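The header-continuation check added in this commit can be exercised on its own. A minimal sketch, assuming two toy DataFrames stand in for tables extracted from consecutive pages; the column names and values are illustrative, not from the app:

import pandas as pd

# Fragments with identical headers are treated as one logical table
# that continues across a page break.
a = pd.DataFrame([[1, 2]], columns=["qty", "price"])
b = pd.DataFrame([[3, 4]], columns=["qty", "price"])

# The length comparison must run first: comparing two pandas Index
# objects of different lengths raises a ValueError, so the short-circuit
# guards the element-wise comparison that follows.
headers_match = len(a.columns) == len(b.columns) and \
                (a.columns == b.columns).all()

if headers_match:
    combined = pd.concat([a, b], ignore_index=True)
    print(combined)  # two rows under the shared qty/price header

Mismatched headers take the other branch in the app: the accumulated table is flushed to CSV and the new fragment starts a fresh one.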
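For reference, the extraction side that the hunks above show only in part can be pieced together as a standalone sketch. The all_tables initializer and the 'table_index' key are assumptions; only the loop body, the 'page_number' and 'dataframe' keys, and the final return all_tables are visible in the diff:

import pdfplumber
import pandas as pd

def extract_tables(file_path):
    all_tables = []  # assumed initializer; the diff only shows the return
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for table_index, table in enumerate(page.extract_tables()):
                # Use the first row as headers, as in the original code
                df = pd.DataFrame(table[1:], columns=table[0])
                all_tables.append({
                    'page_number': page_number,  # read back by aggregate_and_save_tables
                    'table_index': table_index,  # assumed key; the loop variable exists either way
                    'dataframe': df,
                })
    return all_tables

Because aggregate_and_save_tables consumes this list directly, both halves of the pipeline can be tested on a local PDF without going through the Streamlit upload path.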