# PDF Table Extractor — Streamlit app (app.py).
# NOTE(review): the lines above this file's code were web-page scrape residue
# ("Guhanselvam's picture / Update app.py / 725126b verified") and have been
# converted into this comment so the module parses as valid Python.
import subprocess
subprocess.check_call(['pip', 'install', 'camelot-py[cv]']) # Install Camelot with OpenCV support
import streamlit as st
import pdfplumber
import pandas as pd
import json
def extract_tables_with_pdfplumber(file):
tables = []
with pdfplumber.open(file) as pdf:
for page_number, page in enumerate(pdf.pages, 1): # Start page numbers at 1
extracted_tables = page.extract_tables()
for table in extracted_tables:
if table: # Check if a table was actually extracted
tables.append({
'page_number': page_number,
'data': table
})
return tables
def process_tables(tables):
def create_dataframe(table_data):
# Handle cases where headers are missing or inconsistent
if not table_data or not table_data[0]:
return pd.DataFrame() # Return an empty DataFrame if no headers are found
headers = table_data[0]
unique_headers = []
for header in headers:
if header in unique_headers:
new_header = f"{header}_{len(unique_headers)}"
unique_headers.append(new_header)
else:
unique_headers.append(header)
return pd.DataFrame(table_data[1:], columns=unique_headers)
all_tables_df = []
for table_data in tables:
df = create_dataframe(table_data['data'])
if not df.empty:
all_tables_df.append(df)
if not all_tables_df:
return None, None, {}
table1_df = pd.concat(all_tables_df[:-1], ignore_index=True) # All but the last table
table1_filename = "table1.csv"
table1_df.to_csv(table1_filename, index=False)
table2_df = all_tables_df[-1] # The last table
table2_filename = "table2.csv"
table2_df.to_csv(table2_filename, index=False)
context = {
'tables_extracted': len(tables),
'table1': {
'description': 'Table 1 (potentially multi-page)',
'pages': [t['page_number'] for t in tables if not t['data'][0] == tables[-1]['data'][0] ],
'csv_path': table1_filename,
'rows': table1_df.shape[0],
'columns': table1_df.shape[1]
},
'table2': {
'description': 'Table 2 (last page)',
'pages': tables[-1]['page_number'],
'csv_path': table2_filename,
'rows': table2_df.shape[0],
'columns': table2_df.shape[1]
}
}
context_filename = "tables_context.json"
with open(context_filename, 'w') as json_file:
json.dump(context, json_file, indent=4)
return table1_filename, table2_filename, context_filename
# Streamlit Web App
st.title("PDF Table Extractor with pdfplumber")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
try:
tables = extract_tables_with_pdfplumber(uploaded_file)
if not tables:
st.error("No tables were extracted from the PDF.")
else:
table1, table2, context = process_tables(tables)
if table1 and table2:
st.success("Extraction complete.")
# Provide download links for the output files
st.download_button(
label="Download Table 1 CSV",
data=open(table1, 'r').read(),
file_name=table1,
mime='text/csv'
)
st.download_button(
label="Download Table 2 CSV",
data=open(table2, 'r').read(),
file_name=table2,
mime='text/csv'
)
st.download_button(
label="Download Context JSON",
data=open(context, 'r').read(),
file_name=context,
mime='application/json'
)
else:
st.error("Not enough tables found in PDF. Minimum of 2 tables are needed.")
except Exception as e:
st.error(f"An error occurred: {e}")