Guhanselvam committed
Commit ca2c678 · verified · 1 Parent(s): 1c1eb47

Update app.py

Files changed (1)
  1. app.py +58 -91
app.py CHANGED
@@ -1,107 +1,74 @@
 import subprocess
 subprocess.check_call(['pip', 'install', 'camelot-py[cv]']) # Install Camelot with OpenCV support
-import streamlit as st
 import pdfplumber
 import pandas as pd
 import json
-import os

-def extract_tables(file_path):
-    all_tables = []
-    with pdfplumber.open(file_path) as pdf:
-        for page_number, page in enumerate(pdf.pages, start=1):
-            extracted_tables = page.extract_tables()
-            for table_index, table in enumerate(extracted_tables):
-                df = pd.DataFrame(table[1:], columns=table[0]) # Use the first row as headers
-                table_info = {
-                    'page_number': page_number,
-                    'table_index': table_index,
-                    'dataframe': df
-                }
-                all_tables.append(table_info)
-    return all_tables
-
-def aggregate_and_save_tables(tables_list):
-    json_metadata = {'tables': [], 'total_tables': len(tables_list)}
-    current_combined_table = None
-    combined_csv_files = []
-
-    for table_info in tables_list:
-        page_number = table_info['page_number']
-        df = table_info['dataframe']
-
-        # Check if headers match for logical table continuation
-        if current_combined_table is None:
-            current_combined_table = df
-        else:
-            # Ensure headers are aligned in structure
-            headers_match = len(current_combined_table.columns) == len(df.columns) and \
-                            (current_combined_table.columns == df.columns).all()
-            if headers_match:
-                current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
-            else:
-                # Save the current table as CSV
-                file_name = f"combined_table_page_{page_number - 1}.csv"
-                current_combined_table.to_csv(file_name, index=False)
-                combined_csv_files.append(file_name)
-                json_metadata['tables'].append({
-                    'csv_file': file_name,
-                    'description': f"Combined table up to page {page_number - 1}",
-                    'rows': current_combined_table.shape[0]
-                })
-                # Start a new table
-                current_combined_table = df
-
-    # Save the last combined table if it exists
-    if current_combined_table is not None:
-        file_name = f"combined_table_final.csv"
-        current_combined_table.to_csv(file_name, index=False)
-        combined_csv_files.append(file_name)
-        json_metadata['tables'].append({
-            'csv_file': file_name,
-            'description': "Final combined table",
-            'rows': current_combined_table.shape[0]
-        })
-
-    # Save overall metadata to JSON
-    json_filename = "tables_metadata.json"
-    with open(json_filename, 'w') as json_file:
-        json.dump(json_metadata, json_file, indent=4)
-
-    return combined_csv_files, json_filename
-
-# Streamlit Web App
-st.title("PDF Table Extractor for Combined Tables")
-
-uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-if uploaded_file is not None:
-    temp_file_path = "temp_uploaded_file.pdf"
-
-    with open(temp_file_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-
-    try:
-        extracted_tables = extract_tables(temp_file_path)
-        os.remove(temp_file_path)
-
-        combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
-
-        st.success("Table extraction and aggregation complete.")
-
-        for csv_file in combined_csvs:
-            st.download_button(
-                label=f"Download {csv_file}",
-                data=open(csv_file, 'r').read(),
-                file_name=csv_file,
-                mime='text/csv'
-            )
-
-        st.download_button(
-            label="Download JSON Metadata",
-            data=open(metadata_file, 'r').read(),
-            file_name=metadata_file,
-            mime='application/json'
-        )
-
-    except Exception as e:
-        st.error(f"An error occurred during processing: {e}")
+def extract_tables_from_pdf(pdf_path):
+    tables_data = {
+        'table1': [],
+        'table2': None
+    }
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Extract pages for Table 1
+        for page_number in range(3): # Table 1 spans from page 1 to 3
+            page = pdf.pages[page_number]
+            extracted_tables = page.extract_tables()
+            for table in extracted_tables:
+                df = pd.DataFrame(table[1:], columns=table[0]) # Assume the first row as headers
+                tables_data['table1'].append(df)
+
+        # Extract last page for Table 2
+        last_page_number = len(pdf.pages) - 1
+        last_page = pdf.pages[last_page_number]
+        extracted_tables = last_page.extract_tables()
+        if extracted_tables:
+            tables_data['table2'] = pd.DataFrame(extracted_tables[0][1:], columns=extracted_tables[0][0])
+
+    return tables_data
+
+def save_data(tables_data):
+    # Combine and save Table 1 data
+    if tables_data['table1']:
+        table1_combined = pd.concat(tables_data['table1'], ignore_index=True)
+        table1_filename = 'table1.csv'
+        table1_combined.to_csv(table1_filename, index=False)
+
+    # Save Table 2 data
+    table2_filename = 'table2.csv'
+    if tables_data['table2'] is not None:
+        tables_data['table2'].to_csv(table2_filename, index=False)
+
+    # Create JSON context
+    context = {
+        'table1': {
+            'description': 'Table 1 extracted from pages 1 to 3',
+            'source': table1_filename,
+            'rows': table1_combined.shape[0]
+        },
+        'table2': {
+            'description': 'Table 2 extracted from the last page',
+            'source': table2_filename,
+            'rows': tables_data['table2'].shape[0] if tables_data['table2'] is not None else 0
+        }
+    }
+
+    context_filename = 'tables_context.json'
+    with open(context_filename, 'w') as json_file:
+        json.dump(context, json_file, indent=4)
+
+    return table1_filename, table2_filename, context_filename
+
+# Main function
+def main(pdf_path):
+    tables_data = extract_tables_from_pdf(pdf_path)
+    table1_filename, table2_filename, context_filename = save_data(tables_data)
+
+    print(f"CSV file for Table 1 saved as: {table1_filename}")
+    print(f"CSV file for Table 2 saved as: {table2_filename}")
+    print(f"JSON context saved as: {context_filename}")
+
+# Example usage
+pdf_path = "your_pdf_file.pdf" # Replace with your actual PDF file path
+main(pdf_path)
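
For reference, a minimal sketch of how the files written by the new app.py could be read back for inspection. The filenames table1.csv, table2.csv, and tables_context.json come from the diff above; the inspect_outputs helper and its printing logic are illustrative assumptions, not part of this commit.

import json
from pathlib import Path

import pandas as pd

def inspect_outputs(output_dir="."):
    # Load the JSON context that save_data() writes; it records each table's CSV path and row count.
    out = Path(output_dir)
    with open(out / "tables_context.json") as f:
        context = json.load(f)

    for name, meta in context.items():
        csv_path = out / meta["source"]
        if not csv_path.exists():
            # table2.csv is only written when a table was actually found on the last page,
            # even though the JSON context always references it.
            print(f"{name}: no CSV written ({meta['description']})")
            continue
        df = pd.read_csv(csv_path)
        print(f"{name}: {meta['description']} -> {df.shape[0]} rows x {df.shape[1]} columns")

inspect_outputs()

The existence check is the one design point worth noting: save_data() references both CSVs in tables_context.json, but only writes table2.csv when the last page actually contains a table.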