import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import shutil

# Check if Tesseract is installed
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None

if TESSERACT_AVAILABLE:
    print("Tesseract OCR found, OCR functionality is enabled.")
else:
    print("Tesseract OCR not found, proceeding without OCR functionality.")


# Function to perform OCR on image
def extract_text_with_ocr(image):
    """Extract recognised words from an image using Tesseract OCR.

    Args:
        image: The image handed to ``pytesseract.image_to_data``.

    Returns:
        A DataFrame built from Tesseract's output dictionary, restricted to
        the entries whose ``text`` is neither missing nor blank.

    Raises:
        RuntimeError: If the ``tesseract`` executable is not on the PATH.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")

    ocr_data = pytesseract.image_to_data(image, output_type='dict')
    ocr_df = pd.DataFrame(ocr_data)

    # Blank entries arrive as empty strings rather than missing values, so a
    # notna() test alone keeps them; require visible text as well.
    text = ocr_df['text']
    has_text = text.notna() & text.astype(str).str.strip().ne('')
    return ocr_df[has_text]


def _fit_tokens(tokens, width):
    """Return ``tokens`` padded or merged so it has exactly ``width`` items."""
    if len(tokens) > width:
        # Fold the surplus words into the last cell instead of dropping them.
        return tokens[:width - 1] + [' '.join(tokens[width - 1:])]
    return tokens + [None] * (width - len(tokens))


def _ocr_rows_to_frame(text_rows, columns):
    """Build a DataFrame with ``columns`` from whitespace-separated OCR rows.

    Returns None when there are no usable rows or no columns to map onto.
    """
    token_rows = [tokens for tokens in (row.split() for row in text_rows) if tokens]
    if not token_rows or not columns:
        return None

    width = len(columns)
    fitted = [_fit_tokens(tokens, width) for tokens in token_rows]
    return pd.DataFrame(fitted, columns=columns)


# Extract tables and text from PDF pages
def extract_table_from_pdf(pdf, columns):
    """Extract the requested columns from every page of a PDF.

    Pages where pdfplumber finds a table are read as structured data; the
    first table row is used as the header, and the table is kept only if it
    contains every requested column. Pages without a table fall back to OCR
    when Tesseract is available.

    Args:
        pdf: Path or file object accepted by ``pdfplumber.open``.
        columns: List of column names to extract.

    Returns:
        A single DataFrame with the rows of all pages, or None if nothing
        was extracted.
    """
    extracted_data = []

    with pdfplumber.open(pdf) as document:
        for page in document.pages:
            table = page.extract_table()

            if table:
                # Handle structured table
                df = pd.DataFrame(table[1:], columns=table[0])
                if all(col in df.columns for col in columns):
                    # Filter only specified columns
                    extracted_data.append(df[columns])
            elif TESSERACT_AVAILABLE:
                # Handle unstructured tables using OCR if Tesseract is available
                image = page.to_image().original
                ocr_data = extract_text_with_ocr(image)
                if ocr_data.empty:
                    # Nothing recognised on this page.
                    continue

                # Basic parsing for OCR data: one row per text block, split
                # on whitespace and mapped onto the requested columns.
                ocr_text_rows = (
                    ocr_data.groupby('block_num')['text']
                    .apply(lambda words: ' '.join(map(str, words)))
                    .tolist()
                )
                ocr_df = _ocr_rows_to_frame(ocr_text_rows, columns)
                if ocr_df is not None:
                    extracted_data.append(ocr_df)

    # Combine all extracted tables into a single DataFrame, or return None if empty
    if not extracted_data:
        return None
    return pd.concat(extracted_data, ignore_index=True)


# Process single or multiple PDFs and ensure at least one sheet is visible
def process_pdfs(files, columns):
    """Process multiple PDF files and save extracted data to an Excel file.

    Args:
        files: Iterable of PDFs (anything ``extract_table_from_pdf`` accepts);
            None is treated as no files.
        columns: Comma-separated column names; surrounding whitespace and
            blank entries are ignored.

    Returns:
        Path of the written workbook. Each PDF that yielded data gets a sheet
        named ``PDF_<position>``; if none did, the workbook holds a single
        "Placeholder" sheet so that it is still a valid Excel file.
    """
    columns = [col.strip() for col in (columns or "").split(",") if col.strip()]
    output_path = "extracted_tables_combined.xlsx"

    # Collect the data first so the placeholder decision is made before any
    # sheet is written; removing a sheet afterwards depends on the Excel
    # engine pandas happens to select.
    sheets = {}
    for idx, file in enumerate(files or []):
        df = extract_table_from_pdf(file, columns)
        if df is not None and not df.empty:
            sheets[f"PDF_{idx + 1}"] = df

    with pd.ExcelWriter(output_path) as writer:
        if sheets:
            for sheet_name, df in sheets.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # Placeholder sheet to avoid "At least one sheet must be visible" error
            placeholder_df = pd.DataFrame([["No data found"]], columns=["Message"])
            placeholder_df.to_excel(writer, sheet_name="Placeholder", index=False)

    return output_path


# Gradio interface function
def gradio_interface(files, columns):
    """Run the extraction for the UI and return ``(file_path, error_message)``.

    On success the error message is an empty string; if a RuntimeError is
    raised the file path is None and the message carries the error text.
    """
    try:
        file_path = process_pdfs(files, columns)
        return file_path, ""
    except RuntimeError as e:
        return None, str(e)


# Gradio application setup with dual outputs (file and error message)
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="multiple", label="Upload PDFs"),
        gr.Textbox(label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')")
    ],
    outputs=[
        gr.File(label="Download Extracted Tables Excel"),
        gr.Textbox(label="Error Message", placeholder="Any errors will appear here.")
    ],
    title="Advanced PDF Table Extractor",
    description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available."
)

iface.launch()