Spaces:

Guhanselvam
/

PDF_Table_Extractor

Runtime error

App Files Files Community

Guhanselvam committed on Oct 21, 2024

Commit

87d1db7

verified ·

1 Parent(s): 7cf7084

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -0

app.py CHANGED Viewed

	@@ -0,0 +1,78 @@

+import subprocess
+subprocess.check_call(['pip', 'install', 'camelot-py[cv]'])  # Install Camelot with OpenCV support
+import streamlit as st
+import camelot
+import pandas as pd
+import json
+import os
+def extract_tables_from_file(file_path):
+    # Extract tables from the PDF
+    tables = camelot.read_pdf(file_path, pages='1-end', multiple_tables=True)
+    # Table 1 spans from pages 1 to 3
+    table1_dfs = [tables[i].df for i in range(min(3, len(tables)))]
+    if table1_dfs:
+        table1_df = pd.concat(table1_dfs, ignore_index=True)
+        table1_filename = "table1.csv"
+        table1_df.to_csv(table1_filename, index=False)
+    # Assume Table 2 is on the last page
+    table2_df = tables[-1].df
+    table2_filename = "table2.csv"
+    table2_df.to_csv(table2_filename, index=False)
+    # Prepare context JSON
+    context = {
+        'table1': {'description': 'Table 1 extracted from pages 1 to 3', 'csv_path': table1_filename},
+        'table2': {'description': 'Table 2 extracted from the last page', 'csv_path': table2_filename},
+    }
+    context_filename = "tables_context.json"
+    with open(context_filename, 'w') as json_file:
+        json.dump(context, json_file, indent=4)
+    return table1_filename, table2_filename, context_filename
+# Streamlit Web App
+st.title("PDF Table Extractor")
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+if uploaded_file is not None:
+    # Save the uploaded file temporarily
+    temp_file_path = "temp_uploaded_file.pdf"
+    with open(temp_file_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    # Extract tables from the uploaded PDF file
+    try:
+        table1, table2, context = extract_tables_from_file(temp_file_path)
+        os.remove(temp_file_path)  # Clean up the temporary file
+        st.success("Extraction complete.")
+        # Provide download links for the output files
+        st.download_button(
+            label="Download Table 1 CSV",
+            data=open(table1, 'rb').read(),
+            file_name=table1,
+            mime='text/csv'
+        )
+        st.download_button(
+            label="Download Table 2 CSV",
+            data=open(table2, 'rb').read(),
+            file_name=table2,
+            mime='text/csv'
+        )
+        st.download_button(
+            label="Download Context JSON",
+            data=open(context, 'rb').read(),
+            file_name=context,
+            mime='application/json'
+        )
+    except Exception as e:
+        st.error(f"An error occurred: {e}")