Spaces:
Sleeping
Sleeping
import logging
import shutil

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
# Probe PATH once at import time for the Tesseract binary and report the result.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
print(
    "Tesseract OCR found, OCR functionality is enabled."
    if TESSERACT_AVAILABLE
    else "Tesseract OCR not found, proceeding without OCR functionality."
)
# Function to perform OCR on image
def extract_text_with_ocr(image):
    """Run Tesseract OCR on an image and return the recognized entries.

    Args:
        image: Image passed straight to ``pytesseract.image_to_data``.

    Returns:
        A DataFrame built from Tesseract's dict output, restricted to rows
        whose ``text`` value is neither missing nor blank.

    Raises:
        RuntimeError: If the Tesseract binary was not found on PATH.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")
    ocr_data = pytesseract.image_to_data(image, output_type='dict')
    ocr_df = pd.DataFrame(ocr_data)
    # Tesseract's dict output uses empty strings (not NaN) for entries that
    # carry no word, so notna() alone would keep them; drop blank strings too.
    text = ocr_df['text']
    has_text = text.notna() & (text.astype(str).str.strip() != '')
    return ocr_df[has_text]
# Extract tables and text from PDF pages
def extract_table_from_pdf(pdf, columns):
    """Extract tables from a PDF, falling back to OCR on pages without one.

    Args:
        pdf: Path or file object handed to ``pdfplumber.open``.
        columns: List of column names to keep.

    Returns:
        A single DataFrame with the rows of every extracted page concatenated
        (index reset), or ``None`` when nothing was extracted.
    """
    extracted_data = []
    # Use a distinct name for the opened document so the ``pdf`` argument is
    # not shadowed inside the loop.
    with pdfplumber.open(pdf) as document:
        for page in document.pages:
            table = page.extract_table()
            if table:
                # Structured table: first row is the header.
                df = pd.DataFrame(table[1:], columns=table[0])
                # NOTE(review): tables lacking any requested column are
                # skipped rather than kept whole — confirm this is intended.
                if all(col in df.columns for col in columns):
                    extracted_data.append(df[columns])
            elif TESSERACT_AVAILABLE:
                # No table detected: OCR the rendered page instead.
                image = page.to_image().original
                ocr_df = _ocr_rows_to_frame(extract_text_with_ocr(image), columns)
                if ocr_df is not None:
                    extracted_data.append(ocr_df)
    if not extracted_data:
        return None
    return pd.concat(extracted_data, ignore_index=True)


def _ocr_rows_to_frame(ocr_data, columns):
    """Build a DataFrame with one row per OCR block, or None if there is no text."""
    if ocr_data.empty or not columns:
        return None
    block_texts = ocr_data.groupby('block_num')['text'].apply(
        lambda words: ' '.join(map(str, words))
    )
    # Blocks that contain only whitespace yield no tokens and are dropped.
    rows = [tokens for tokens in (text.split() for text in block_texts) if tokens]
    if not rows:
        return None
    # Never use more columns than were requested, nor more than any row fills.
    width = min(len(columns), max(len(tokens) for tokens in rows))
    normalized = []
    for tokens in rows:
        if len(tokens) > width:
            # Fold overflow tokens into the last cell instead of losing them.
            tokens = tokens[:width - 1] + [' '.join(tokens[width - 1:])]
        else:
            # Pad short rows so every row has the same number of cells.
            tokens = tokens + [None] * (width - len(tokens))
        normalized.append(tokens)
    return pd.DataFrame(normalized, columns=columns[:width])
# Process single or multiple PDFs and ensure at least one sheet is visible
def process_pdfs(files, columns):
    """Extract tables from each PDF and save them to one Excel workbook.

    Each PDF that yields data gets its own sheet named ``PDF_<n>``, where
    ``n`` is the 1-based position of the file in ``files``. When no file
    yields data, a single "Placeholder" sheet with a "No data found" message
    is written so the workbook always has a visible sheet.

    Args:
        files: Iterable of PDFs passed one by one to
            ``extract_table_from_pdf``; ``None`` is treated as no files.
        columns: Comma-separated column names; ``None`` is treated as an
            empty string. Blank entries (e.g. from a trailing comma) are
            ignored.

    Returns:
        The path of the written workbook.
    """
    columns = [col.strip() for col in (columns or "").split(",")]
    columns = [col for col in columns if col]
    output_path = "extracted_tables_combined.xlsx"

    # Collect the frames before opening the writer so the sheet layout is
    # known up front; this avoids writing a placeholder and then removing it
    # through engine-specific workbook calls.
    sheets = {}
    for idx, file in enumerate(files or []):
        df = extract_table_from_pdf(file, columns)
        if df is not None and not df.empty:
            sheets[f"PDF_{idx + 1}"] = df

    if not sheets:
        sheets["Placeholder"] = pd.DataFrame([["No data found"]], columns=["Message"])

    with pd.ExcelWriter(output_path) as writer:
        for sheet_name, df in sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    return output_path
# Gradio interface function
def gradio_interface(files, columns):
    """Run the extraction for the UI and report failures as text.

    Args:
        files: Uploaded PDFs, forwarded to ``process_pdfs``.
        columns: Comma-separated column names, forwarded to ``process_pdfs``.

    Returns:
        A ``(file_path, error_message)`` pair: the workbook path and an empty
        string on success, or ``None`` and a message on failure.
    """
    try:
        file_path = process_pdfs(files, columns)
    except RuntimeError as exc:
        return None, str(exc)
    except Exception as exc:
        # This is the UI boundary: surface every failure in the error box
        # instead of letting it escape, and keep the traceback in the log.
        logging.getLogger(__name__).exception("PDF extraction failed")
        return None, f"{type(exc).__name__}: {exc}"
    return file_path, ""
# Gradio application setup with dual outputs (file and error message)
_input_components = [
    gr.File(file_count="multiple", label="Upload PDFs"),
    gr.Textbox(label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')"),
]
_output_components = [
    gr.File(label="Download Extracted Tables Excel"),
    gr.Textbox(label="Error Message", placeholder="Any errors will appear here."),
]

# Wire the handler to the components and start the app.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_components,
    outputs=_output_components,
    title="Advanced PDF Table Extractor",
    description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available.",
)
iface.launch()