Spaces:

neerajkalyank
/

pdf2excel

Sleeping

App Files Files Community

pdf2excel / app.py

neerajkalyank

Update app.py

91cadca verified about 1 year ago

raw

history blame contribute delete

4.45 kB

	import gradio as gr
	import pdfplumber
	import pytesseract
	from pdf2image import convert_from_path
	import pandas as pd
	import shutil

	# Check if Tesseract is installed
	TESSERACT_AVAILABLE = shutil.which("tesseract") is not None

	if TESSERACT_AVAILABLE:
	print("Tesseract OCR found, OCR functionality is enabled.")
	else:
	print("Tesseract OCR not found, proceeding without OCR functionality.")

	# Function to perform OCR on image
	def extract_text_with_ocr(image):
	"""Extracts text from an image using Tesseract OCR."""
	if not TESSERACT_AVAILABLE:
	raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")
	ocr_data = pytesseract.image_to_data(image, output_type='dict')
	ocr_df = pd.DataFrame(ocr_data)
	ocr_df = ocr_df[ocr_df['text'].notna()] # Filter out empty text
	return ocr_df

	# Extract tables and text from PDF pages
	def extract_table_from_pdf(pdf, columns):
	"""Extract tables or perform OCR for unstructured pages in the PDF."""
	extracted_data = []

	with pdfplumber.open(pdf) as pdf:
	for page_num, page in enumerate(pdf.pages):
	table = page.extract_table()

	if table:
	# Handle structured table
	df = pd.DataFrame(table[1:], columns=table[0])
	if all(col in df.columns for col in columns): # Filter only specified columns
	df = df[columns]
	extracted_data.append(df)
	elif TESSERACT_AVAILABLE:
	# Handle unstructured tables using OCR if Tesseract is available
	image = page.to_image().original
	ocr_data = extract_text_with_ocr(image)

	# Basic parsing for OCR data, assuming columns
	ocr_text_rows = ocr_data.groupby('block_num')['text'].apply(lambda x: ' '.join(x)).tolist()
	ocr_df = pd.DataFrame([row.split() for row in ocr_text_rows], columns=columns[:len(ocr_text_rows[0].split())])
	extracted_data.append(ocr_df)

	# Combine all extracted tables into a single DataFrame, or return None if empty
	if extracted_data:
	result_df = pd.concat(extracted_data, ignore_index=True)
	return result_df
	else:
	return None

	# Process single or multiple PDFs and ensure at least one sheet is visible
	def process_pdfs(files, columns):
	"""Process multiple PDF files and save extracted data to an Excel file."""
	columns = [col.strip() for col in columns.split(",")]
	output_path = "extracted_tables_combined.xlsx"

	with pd.ExcelWriter(output_path) as writer:
	# Placeholder sheet to avoid "At least one sheet must be visible" error
	placeholder_df = pd.DataFrame([["No data found"]], columns=["Message"])
	placeholder_df.to_excel(writer, sheet_name="Placeholder", index=False)

	sheets_written = False # Track if any actual data sheets were written

	for idx, file in enumerate(files):
	df = extract_table_from_pdf(file, columns)
	if df is not None and not df.empty:
	df.to_excel(writer, sheet_name=f"PDF_{idx + 1}", index=False)
	sheets_written = True

	# Remove the placeholder sheet if any data sheets were written
	if sheets_written:
	workbook = writer.book
	if "Placeholder" in workbook.sheetnames:
	workbook.remove(workbook["Placeholder"])

	return output_path

	# Gradio interface function
	def gradio_interface(files, columns):
	try:
	file_path = process_pdfs(files, columns)
	return file_path, ""
	except RuntimeError as e:
	return None, str(e)

	# Gradio application setup with dual outputs (file and error message)
	iface = gr.Interface(
	fn=gradio_interface,
	inputs=[
	gr.File(file_count="multiple", label="Upload PDFs"),
	gr.Textbox(label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')")
	],
	outputs=[
	gr.File(label="Download Extracted Tables Excel"),
	gr.Textbox(label="Error Message", placeholder="Any errors will appear here.")
	],
	title="Advanced PDF Table Extractor",
	description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available."
	)

	iface.launch()