File size: 4,164 Bytes
87d1db7
 
70b87d3
d3777b4
87d1db7
 
 
2b00e5d
 
 
725126b
ca2c678
 
725126b
2b00e5d
725126b
2b00e5d
 
 
87d1db7
2b00e5d
712a7bd
725126b
 
 
 
712a7bd
 
 
 
725126b
712a7bd
 
 
 
725126b
 
 
 
 
 
 
 
 
 
 
 
 
2b00e5d
 
87d1db7
725126b
 
2b00e5d
712a7bd
87d1db7
ca2c678
2b00e5d
ca2c678
725126b
 
2b00e5d
 
 
ca2c678
 
725126b
2b00e5d
 
712a7bd
 
ca2c678
 
2b00e5d
 
ca2c678
 
87d1db7
ca2c678
87d1db7
725126b
70b87d3
2b00e5d
70b87d3
2b00e5d
70b87d3
 
2b00e5d
 
 
 
 
725126b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87d1db7
70b87d3
2b00e5d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import subprocess  # retained for compatibility; no longer used at import time
# NOTE(review): removed the runtime `subprocess.check_call(['pip', 'install',
# 'camelot-py[cv]'])` that was here — camelot is never imported or used in
# this file (all extraction is done with pdfplumber), and installing packages
# at import time slows startup and crashes the app when pip is unavailable.
import streamlit as st
import pdfplumber
import pandas as pd
import json

def extract_tables_with_pdfplumber(file):
    """Extract every table from *file* using pdfplumber.

    Returns a list of dicts, one per extracted table, each carrying the
    1-based ``page_number`` it was found on and its raw ``data`` rows.
    Falsy extraction results are dropped.
    """
    results = []
    with pdfplumber.open(file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Keep only genuinely extracted (non-empty) tables.
            results.extend(
                {'page_number': page_number, 'data': tbl}
                for tbl in page.extract_tables()
                if tbl
            )
    return results

def process_tables(tables):
    """Split extracted tables into two CSV files plus a JSON context file.

    Every usable table except the last is concatenated into ``table1.csv``
    (treated as one potentially multi-page table); the last table is written
    to ``table2.csv``.  A ``tables_context.json`` file describing both
    outputs is also written.

    Parameters:
        tables: list of dicts with ``page_number`` and ``data`` keys, as
            produced by ``extract_tables_with_pdfplumber``.

    Returns:
        ``(table1_filename, table2_filename, context_filename)`` on success,
        or ``(None, None, {})`` when fewer than two usable tables exist.
    """

    def _create_dataframe(table_data):
        """Build a DataFrame from raw rows, de-duplicating header names."""
        if not table_data or not table_data[0]:
            return pd.DataFrame()  # no header row -> nothing usable

        unique_headers = []
        seen = set()
        for header in table_data[0]:
            candidate = header
            # Suffix duplicates until the name is genuinely unique.  The
            # previous len()-based suffix could still collide with an
            # existing header (e.g. headers [A, A_2, A, A]).
            suffix = 1
            while candidate in seen:
                candidate = f"{header}_{suffix}"
                suffix += 1
            seen.add(candidate)
            unique_headers.append(candidate)
        return pd.DataFrame(table_data[1:], columns=unique_headers)

    frames = []
    page_numbers = []  # page of each *usable* table, aligned with `frames`
    for table in tables:
        df = _create_dataframe(table['data'])
        if not df.empty:
            frames.append(df)
            page_numbers.append(table['page_number'])

    # Need at least two usable tables: one (or more) for table1 and one for
    # table2.  The previous version crashed with exactly one table, because
    # pd.concat([]) raises ValueError before the empty-list check could help.
    if len(frames) < 2:
        return None, None, {}

    table1_df = pd.concat(frames[:-1], ignore_index=True)  # all but the last
    table1_filename = "table1.csv"
    table1_df.to_csv(table1_filename, index=False)

    table2_df = frames[-1]  # the last table stands alone
    table2_filename = "table2.csv"
    table2_df.to_csv(table2_filename, index=False)

    context = {
        'tables_extracted': len(tables),
        'table1': {
            'description': 'Table 1 (potentially multi-page)',
            # Pages of every table folded into table1 (all but the last).
            # The old header-comparison heuristic mislabeled pages whenever
            # table1 and table2 happened to share a header row.
            'pages': page_numbers[:-1],
            'csv_path': table1_filename,
            'rows': table1_df.shape[0],
            'columns': table1_df.shape[1]
        },
        'table2': {
            'description': 'Table 2 (last page)',
            'pages': page_numbers[-1],
            'csv_path': table2_filename,
            'rows': table2_df.shape[0],
            'columns': table2_df.shape[1]
        }
    }

    context_filename = "tables_context.json"
    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename


# Streamlit Web App


def _read_file(path):
    """Return the full contents of *path*, closing the handle promptly."""
    with open(path, 'r') as fh:
        return fh.read()


st.title("PDF Table Extractor with pdfplumber")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    try:
        tables = extract_tables_with_pdfplumber(uploaded_file)
        if not tables:
            st.error("No tables were extracted from the PDF.")
        else:
            table1, table2, context = process_tables(tables)
            if table1 and table2:
                st.success("Extraction complete.")

                # One download button per output artifact.  Files are read
                # via a context manager (the original used bare
                # open(...).read() calls, leaking three file handles).
                for label, path, mime in (
                    ("Download Table 1 CSV", table1, 'text/csv'),
                    ("Download Table 2 CSV", table2, 'text/csv'),
                    ("Download Context JSON", context, 'application/json'),
                ):
                    st.download_button(
                        label=label,
                        data=_read_file(path),
                        file_name=path,
                        mime=mime,
                    )
            else:
                st.error("Not enough tables found in PDF. Minimum of 2 tables are needed.")

    except Exception as e:
        # Top-level boundary: surface the failure in the UI rather than
        # letting Streamlit render a raw traceback.
        st.error(f"An error occurred: {e}")