Spaces:

Guhanselvam
/

PDF_Table_Extractor

Runtime error

Guhanselvam committed on Oct 21, 2024

Commit

712a7bd

verified ·

1 Parent(s): 2b00e5d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -20,16 +20,28 @@ def extract_tables_with_pdfplumber(file):
     return tables
 def process_tables(tables):
-    # Extract the first three tables assumed to be part of Table 1
-    table1_data = [pd.DataFrame(t['data'][1:], columns=t['data'][0]) for t in tables[:3]]
     table1_df = pd.concat(table1_data, ignore_index=True)
     table1_filename = "table1.csv"
     table1_df.to_csv(table1_filename, index=False)
-    # Extract the last table assumed to be Table 2
-    table2_data = pd.DataFrame(tables[-1]['data'][1:], columns=tables[-1]['data'][0])
     table2_filename = "table2.csv"
-    table2_data.to_csv(table2_filename, index=False)
     # Prepare context JSON with detailed context
     context = {
@@ -45,8 +57,8 @@ def process_tables(tables):
             'description': 'Table 2 extracted from the last page',
             'pages': tables[-1]['page_number'],
             'csv_path': table2_filename,
-            'rows': table2_data.shape[0],
-            'columns': table2_data.shape[1]
         }
     }

     return tables
 def process_tables(tables):
+    def create_dataframe(table_data):
+        headers = table_data[0]
+        # Make headers unique if they are not
+        unique_headers = []
+        for header in headers:
+            if header in unique_headers:
+                new_header = f"{header}_{unique_headers.count(header)}"
+                unique_headers.append(new_header)
+            else:
+                unique_headers.append(header)
+        return pd.DataFrame(table_data[1:], columns=unique_headers)
+    # Extract and process the tables into DataFrames
+    table1_data = [create_dataframe(t['data']) for t in tables[:3]]
     table1_df = pd.concat(table1_data, ignore_index=True)
     table1_filename = "table1.csv"
     table1_df.to_csv(table1_filename, index=False)
+    # Extract and process the last table assumed to be Table 2
+    table2_df = create_dataframe(tables[-1]['data'])
     table2_filename = "table2.csv"
+    table2_df.to_csv(table2_filename, index=False)
     # Prepare context JSON with detailed context
     context = {
             'description': 'Table 2 extracted from the last page',
             'pages': tables[-1]['page_number'],
             'csv_path': table2_filename,
+            'rows': table2_df.shape[0],
+            'columns': table2_df.shape[1]
         }
     }