Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pdfplumber | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import pandas as pd | |
| import shutil | |
# OCR is optional: probe PATH once at import time for the Tesseract binary and
# report which mode the app is running in.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
print(
    "Tesseract OCR found, OCR functionality is enabled."
    if TESSERACT_AVAILABLE
    else "Tesseract OCR not found, proceeding without OCR functionality."
)
# Function to perform OCR on image
def extract_text_with_ocr(image):
    """Run Tesseract OCR on an image and return the recognised words.

    Args:
        image: Image accepted by ``pytesseract.image_to_data``.

    Returns:
        pandas.DataFrame built from Tesseract's per-box output, restricted to
        rows whose ``text`` holds at least one non-whitespace character.

    Raises:
        RuntimeError: If the Tesseract binary was not found on PATH.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)
    # Tesseract reports structural boxes (page, block, paragraph, line) with an
    # empty string as their text, so checking for NaN alone removes nothing;
    # blank and whitespace-only entries have to be dropped explicitly.
    text = ocr_df['text']
    has_text = text.notna() & (text.astype(str).str.strip() != '')
    return ocr_df[has_text]
def _ocr_rows_to_frame(ocr_data, columns):
    """Convert OCR word rows into a DataFrame of at most ``len(columns)`` columns.

    Words are joined per ``block_num`` and split on whitespace, one row per
    block. Returns None when there is no usable text or no column names.
    """
    block_texts = ocr_data.groupby('block_num')['text'].apply(
        lambda words: ' '.join(map(str, words))
    )
    rows = [tokens for tokens in (text.split() for text in block_texts) if tokens]
    if not rows or not columns:
        return None

    # Never use more columns than were requested, nor more than the data has.
    width = min(len(columns), max(len(tokens) for tokens in rows))
    normalised = []
    for tokens in rows:
        if len(tokens) > width:
            # Keep overflow words in the last cell instead of dropping them.
            tokens = tokens[:width - 1] + [' '.join(tokens[width - 1:])]
        # Pad short rows so every row has exactly ``width`` cells.
        normalised.append(tokens + [None] * (width - len(tokens)))
    return pd.DataFrame(normalised, columns=columns[:width])


# Extract tables and text from PDF pages
def extract_table_from_pdf(pdf, columns):
    """Extract a table from every page of a PDF, using OCR as a fallback.

    For each page, ``page.extract_table()`` is tried first; its first row is
    used as the header. If every requested column is present the table is
    narrowed to those columns, otherwise it is kept unfiltered. Pages without
    a detectable table are rendered and OCR'd when Tesseract is available.

    Args:
        pdf: Path or file object accepted by ``pdfplumber.open``.
        columns: List of column names to keep (also used to label OCR rows).

    Returns:
        A single DataFrame with the rows of all pages concatenated, or None
        when nothing was extracted.
    """
    extracted_data = []
    # Bind the opened document to its own name so the ``pdf`` argument is not
    # shadowed inside the function.
    with pdfplumber.open(pdf) as document:
        for page in document.pages:
            table = page.extract_table()
            if table:
                # Handle structured table
                df = pd.DataFrame(table[1:], columns=table[0])
                # Guard on ``columns``: an empty request must not select zero
                # columns and silently discard the table.
                if columns and all(col in df.columns for col in columns):
                    df = df[columns]
                extracted_data.append(df)
            elif TESSERACT_AVAILABLE:
                # Handle unstructured tables using OCR if Tesseract is available
                image = page.to_image().original
                ocr_df = _ocr_rows_to_frame(extract_text_with_ocr(image), columns)
                if ocr_df is not None:
                    extracted_data.append(ocr_df)

    # Combine all extracted tables into a single DataFrame, or return None if empty
    if not extracted_data:
        return None
    return pd.concat(extracted_data, ignore_index=True)
# Process single or multiple PDFs and ensure at least one sheet is visible
def process_pdfs(files, columns):
    """Extract tables from the given PDFs and save them to one Excel workbook.

    Each PDF that yields data gets its own sheet named ``PDF_<n>``, where
    ``n`` is the 1-based position of the file in ``files``. If no PDF yields
    data, a single "Placeholder" sheet with a "No data found" message is
    written so the workbook is still valid.

    Args:
        files: Iterable of PDF paths/file objects; ``None`` is treated as empty.
        columns: Comma-separated column names, e.g. ``"Sl No, Quantity"``.

    Returns:
        Path of the written Excel file.
    """
    columns = [col.strip() for col in (columns or "").split(",")]
    output_path = "extracted_tables_combined.xlsx"

    # Extract everything before opening the writer: a failure while parsing a
    # PDF then surfaces as-is instead of leaving a half-written workbook.
    sheets = []
    for idx, file in enumerate(files or []):
        df = extract_table_from_pdf(file, columns)
        if df is not None and not df.empty:
            sheets.append((f"PDF_{idx + 1}", df))

    if not sheets:
        # A workbook must contain at least one visible sheet. Adding the
        # placeholder only when needed avoids deleting it afterwards through
        # engine-specific workbook APIs.
        sheets.append(("Placeholder", pd.DataFrame([["No data found"]], columns=["Message"])))

    with pd.ExcelWriter(output_path) as writer:
        for sheet_name, df in sheets:
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    return output_path
# Gradio interface function
def gradio_interface(files, columns):
    """Gradio callback: build the Excel file and report problems as text.

    Args:
        files: Uploaded PDFs as passed by the file input (may be None or
            empty when nothing was uploaded).
        columns: Comma-separated column names typed by the user.

    Returns:
        ``(file_path, "")`` on success, or ``(None, message)`` when there is
        nothing to process or processing failed.
    """
    if not files:
        return None, "No PDF files were uploaded."
    try:
        file_path = process_pdfs(files, columns)
    except Exception as e:
        # This is the UI boundary: any failure (not only RuntimeError) should
        # land in the error textbox rather than abort the request.
        return None, str(e) or type(e).__name__
    return file_path, ""
# Gradio application setup with dual outputs (file and error message)
# Inputs: the PDFs to scan and the comma-separated column names to keep.
_input_components = [
    gr.File(file_count="multiple", label="Upload PDFs"),
    gr.Textbox(label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')"),
]
# Outputs: the generated workbook plus a text field for error messages.
_output_components = [
    gr.File(label="Download Extracted Tables Excel"),
    gr.Textbox(label="Error Message", placeholder="Any errors will appear here."),
]
iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_components,
    outputs=_output_components,
    title="Advanced PDF Table Extractor",
    description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available.",
)
# Start the web server for the interface.
iface.launch()