pdf2excel / app.py
neerajkalyank's picture
Update app.py
91cadca verified
import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import shutil
# Check if Tesseract is installed
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
if TESSERACT_AVAILABLE:
print("Tesseract OCR found, OCR functionality is enabled.")
else:
print("Tesseract OCR not found, proceeding without OCR functionality.")
# Function to perform OCR on image
def extract_text_with_ocr(image):
"""Extracts text from an image using Tesseract OCR."""
if not TESSERACT_AVAILABLE:
raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")
ocr_data = pytesseract.image_to_data(image, output_type='dict')
ocr_df = pd.DataFrame(ocr_data)
ocr_df = ocr_df[ocr_df['text'].notna()] # Filter out empty text
return ocr_df
# Extract tables and text from PDF pages
def extract_table_from_pdf(pdf, columns):
"""Extract tables or perform OCR for unstructured pages in the PDF."""
extracted_data = []
with pdfplumber.open(pdf) as pdf:
for page_num, page in enumerate(pdf.pages):
table = page.extract_table()
if table:
# Handle structured table
df = pd.DataFrame(table[1:], columns=table[0])
if all(col in df.columns for col in columns): # Filter only specified columns
df = df[columns]
extracted_data.append(df)
elif TESSERACT_AVAILABLE:
# Handle unstructured tables using OCR if Tesseract is available
image = page.to_image().original
ocr_data = extract_text_with_ocr(image)
# Basic parsing for OCR data, assuming columns
ocr_text_rows = ocr_data.groupby('block_num')['text'].apply(lambda x: ' '.join(x)).tolist()
ocr_df = pd.DataFrame([row.split() for row in ocr_text_rows], columns=columns[:len(ocr_text_rows[0].split())])
extracted_data.append(ocr_df)
# Combine all extracted tables into a single DataFrame, or return None if empty
if extracted_data:
result_df = pd.concat(extracted_data, ignore_index=True)
return result_df
else:
return None
# Process single or multiple PDFs and ensure at least one sheet is visible
def process_pdfs(files, columns):
"""Process multiple PDF files and save extracted data to an Excel file."""
columns = [col.strip() for col in columns.split(",")]
output_path = "extracted_tables_combined.xlsx"
with pd.ExcelWriter(output_path) as writer:
# Placeholder sheet to avoid "At least one sheet must be visible" error
placeholder_df = pd.DataFrame([["No data found"]], columns=["Message"])
placeholder_df.to_excel(writer, sheet_name="Placeholder", index=False)
sheets_written = False # Track if any actual data sheets were written
for idx, file in enumerate(files):
df = extract_table_from_pdf(file, columns)
if df is not None and not df.empty:
df.to_excel(writer, sheet_name=f"PDF_{idx + 1}", index=False)
sheets_written = True
# Remove the placeholder sheet if any data sheets were written
if sheets_written:
workbook = writer.book
if "Placeholder" in workbook.sheetnames:
workbook.remove(workbook["Placeholder"])
return output_path
# Gradio interface function
def gradio_interface(files, columns):
try:
file_path = process_pdfs(files, columns)
return file_path, ""
except RuntimeError as e:
return None, str(e)
# Gradio application setup with dual outputs (file and error message)
iface = gr.Interface(
fn=gradio_interface,
inputs=[
gr.File(file_count="multiple", label="Upload PDFs"),
gr.Textbox(label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')")
],
outputs=[
gr.File(label="Download Extracted Tables Excel"),
gr.Textbox(label="Error Message", placeholder="Any errors will appear here.")
],
title="Advanced PDF Table Extractor",
description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available."
)
iface.launch()