MatDatabase_Dev

Running

Upload folder using huggingface_hub

1adc2e7 verified 30 days ago

1.91 kB

	import os
	import tempfile

	import pandas as pd
	import pymupdf
	import streamlit as st
	import tabula
	from tqdm import tqdm


	def extract_tables_pymupdf(pdf_path):
	    """Extract every table that PyMuPDF detects in a PDF file.

	    Each page is scanned with ``page.find_tables()``. For every table that
	    yields data, the first extracted row is used as the column header and
	    the remaining rows become the DataFrame body.

	    Args:
	        pdf_path: Path of the PDF file to open.

	    Returns:
	        A list of dicts, one per non-empty table, with the keys ``'page'``
	        (1-based page number) and ``'dataframe'`` (a ``pandas.DataFrame``).
	        If anything goes wrong the error is reported through ``st.error``
	        and an empty list is returned.
	    """
	    try:
	        doc = pymupdf.open(pdf_path)
	        try:
	            all_tables = []
	            for page_num, page in enumerate(doc, start=1):
	                for table in page.find_tables():
	                    table_data = table.extract()
	                    if not table_data:
	                        continue
	                    # First row is the header, everything after it is data.
	                    header, *rows = table_data
	                    all_tables.append({
	                        'page': page_num,
	                        'dataframe': pd.DataFrame(rows, columns=header),
	                    })
	            return all_tables
	        finally:
	            # Close the document even when extraction fails part-way, so the
	            # file handle is released before the caller deletes the file.
	            doc.close()
	    except Exception as e:
	        # UI boundary: report the failure to the user instead of crashing.
	        st.error(f"Error extracting tables with PyMuPDF: {e}")
	        return []

	def main():
	    """Render the Streamlit page: upload a PDF and display its tables.

	    The uploaded bytes are written to a uniquely named temporary file,
	    handed to ``extract_tables_pymupdf``, and the file is removed again
	    whether or not extraction succeeds. Every table found is shown with a
	    subheader giving its index and page number.
	    """
	    st.title("PDF Table Extractor")
	    st.write("Upload a PDF to extract all tables")

	    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
	    if uploaded_file is None:
	        return

	    # A unique temp file per run: a fixed file name in the working directory
	    # would be overwritten or deleted by other sessions running concurrently.
	    # delete=False so the file can be closed here and reopened by path.
	    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
	    temp_path = tmp.name
	    try:
	        with tmp:
	            tmp.write(uploaded_file.getbuffer())

	        # Using PyMuPDF
	        tables = extract_tables_pymupdf(temp_path)
	    finally:
	        # Clean up the temp file even if writing or extraction raised.
	        if os.path.exists(temp_path):
	            os.remove(temp_path)

	    if tables:
	        st.success(f"Found {len(tables)} tables!")

	        for number, table_info in enumerate(tables, start=1):
	            st.subheader(f"Table {number} (Page {table_info['page']})")
	            st.dataframe(table_info['dataframe'], use_container_width=True)