Guhanselvam committed on
Commit
2b00e5d
·
verified ·
1 Parent(s): 70b87d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -57
app.py CHANGED
@@ -4,86 +4,92 @@ import streamlit as st
4
  import pdfplumber
5
  import pandas as pd
6
  import json
7
- import os
8
 
9
def extract_tables_from_pdf(file_stream):
    """Extract tables from a PDF file stream.

    Returns a dict with:
      'table1': list of DataFrames, one per table found on the first pages
                (up to 3), each using its first row as the header.
      'table2': DataFrame built from the first table on the last page,
                or None when the last page has no tables.
    """
    tables_data = {
        'table1': [],
        'table2': None
    }

    with pdfplumber.open(file_stream) as pdf:
        # Table 1 spans the first pages. The original hard-coded range(3),
        # which raised IndexError for PDFs shorter than 3 pages; clamp to
        # the actual page count instead.
        for page_number in range(min(3, len(pdf.pages))):
            page = pdf.pages[page_number]
            extracted_tables = page.extract_tables()
            for table in extracted_tables:
                # Assume the first row is the header row.
                df = pd.DataFrame(table[1:], columns=table[0])
                tables_data['table1'].append(df)

        # Table 2 is the first table on the last page (if any).
        last_page = pdf.pages[-1]
        extracted_tables = last_page.extract_tables()
        if extracted_tables:
            tables_data['table2'] = pd.DataFrame(
                extracted_tables[0][1:], columns=extracted_tables[0][0]
            )

    return tables_data
32
 
33
def save_data(tables_data):
    """Persist extracted tables as CSVs plus a JSON context summary.

    tables_data: dict with 'table1' (list of DataFrames, possibly empty)
    and 'table2' (a DataFrame or None).
    Returns (table1_csv_path, table2_csv_path, context_json_path); the CSV
    files are only written when the corresponding data is present, but
    their paths are always returned.
    """
    cwd = os.getcwd()
    table1_filename = os.path.join(cwd, 'table1.csv')
    table2_filename = os.path.join(cwd, 'table2.csv')
    context_filename = os.path.join(cwd, 'tables_context.json')

    # Table 1: merge the per-page DataFrames before writing.
    table1_rows = 0
    if tables_data['table1']:
        combined = pd.concat(tables_data['table1'], ignore_index=True)
        combined.to_csv(table1_filename, index=False)
        table1_rows = combined.shape[0]

    # Table 2: a single DataFrame, written as-is when present.
    table2 = tables_data['table2']
    table2_rows = 0
    if table2 is not None:
        table2.to_csv(table2_filename, index=False)
        table2_rows = table2.shape[0]

    # Describe what was saved, and where, for downstream consumers.
    context = {
        'table1': {
            'description': 'Table 1 extracted from pages 1 to 3',
            'source': table1_filename,
            'rows': table1_rows,
        },
        'table2': {
            'description': 'Table 2 extracted from the last page',
            'source': table2_filename,
            'rows': table2_rows,
        },
    }

    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename
66
 
67
# Streamlit Web App
st.title("PDF Table Extractor")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file is not None:
    try:
        tables_data = extract_tables_from_pdf(uploaded_file)
        table1_filename, table2_filename, context_filename = save_data(tables_data)

        st.success("Extraction complete.")

        # Offer each generated artifact for download.
        downloads = [
            (table1_filename, "Download Table 1 CSV", 'table1.csv', 'text/csv'),
            (table2_filename, "Download Table 2 CSV", 'table2.csv', 'text/csv'),
            (context_filename, "Download JSON Context", 'tables_context.json', 'application/json'),
        ]
        for path, label, out_name, mime_type in downloads:
            with open(path, 'rb') as f:
                st.download_button(label, data=f, file_name=out_name, mime=mime_type)

    except Exception as e:
        st.error(f"An error occurred during processing: {e}")
 
4
  import pdfplumber
5
  import pandas as pd
6
  import json
 
7
 
8
def extract_tables_with_pdfplumber(file):
    """Collect every table in the PDF, tagged with its 1-based page number.

    Returns a list of dicts of the form
    {'page_number': int, 'data': raw table rows as returned by pdfplumber}.
    """
    collected = []
    with pdfplumber.open(file) as pdf:
        for index, page in enumerate(pdf.pages, start=1):
            # Keep only non-empty extraction results from this page.
            collected.extend(
                {'page_number': index, 'data': table}
                for table in page.extract_tables()
                if table
            )
    return collected
 
 
 
 
 
21
 
22
def process_tables(tables):
    """Write Table 1 and Table 2 to CSV files plus a JSON summary.

    Table 1 is the concatenation of the first up-to-three extracted
    tables; Table 2 is the last extracted table. Each raw table's first
    row is treated as its header.

    tables: non-empty list of {'page_number': int, 'data': rows} dicts.
    Returns (table1_csv, table2_csv, context_json) filenames.
    """
    # Table 1: first up-to-three tables, merged into one frame.
    head = tables[:3]
    frames = [pd.DataFrame(entry['data'][1:], columns=entry['data'][0]) for entry in head]
    table1_df = pd.concat(frames, ignore_index=True)
    table1_filename = "table1.csv"
    table1_df.to_csv(table1_filename, index=False)

    # Table 2: the last table in the document.
    tail = tables[-1]
    table2_df = pd.DataFrame(tail['data'][1:], columns=tail['data'][0])
    table2_filename = "table2.csv"
    table2_df.to_csv(table2_filename, index=False)

    # Summarize what was extracted and where it was written.
    context = {
        'tables_extracted': len(tables),
        'table1': {
            'description': 'Table 1 extracted from the first 3 pages',
            'pages': [entry['page_number'] for entry in head],
            'csv_path': table1_filename,
            'rows': table1_df.shape[0],
            'columns': table1_df.shape[1],
        },
        'table2': {
            'description': 'Table 2 extracted from the last page',
            'pages': tail['page_number'],
            'csv_path': table2_filename,
            'rows': table2_df.shape[0],
            'columns': table2_df.shape[1],
        },
    }

    context_filename = "tables_context.json"
    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename
58
 
59
# Streamlit Web App
st.title("PDF Table Extractor with pdfplumber")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    try:
        tables = extract_tables_with_pdfplumber(uploaded_file)
        if not tables:
            st.error("No tables were extracted from the PDF.")
        else:
            table1, table2, context = process_tables(tables)
            st.success("Extraction complete.")

            # Provide download links for the output files.
            # The previous open(path, 'r').read() calls leaked file
            # handles; read inside a `with` block instead.
            with open(table1, 'r') as f:
                st.download_button(
                    label="Download Table 1 CSV",
                    data=f.read(),
                    file_name=table1,
                    mime='text/csv'
                )

            with open(table2, 'r') as f:
                st.download_button(
                    label="Download Table 2 CSV",
                    data=f.read(),
                    file_name=table2,
                    mime='text/csv'
                )

            with open(context, 'r') as f:
                st.download_button(
                    label="Download Context JSON",
                    data=f.read(),
                    file_name=context,
                    mime='application/json'
                )

    except Exception as e:
        st.error(f"An error occurred: {e}")