import re
import tempfile
from io import BytesIO

import fitz  # PyMuPDF for PDF reading
import gradio as gr
import pandas as pd
import pytesseract  # Tesseract for OCR
from tqdm import tqdm  # For progress indicators

# Set Tesseract path if necessary
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'  # Update this path as necessary


def ai_parse_pdf(pdf_file, language="eng"):
    """Open a PDF and return a placeholder DataFrame plus the PDF's metadata.

    Args:
        pdf_file: Either an object exposing ``read()`` (its bytes are handed
            to PyMuPDF as a stream) or a ``str`` path to a PDF on disk.
        language: OCR language code. Accepted for interface compatibility;
            the current debugging implementation does not use it.

    Returns:
        ``(df, metadata)`` where ``df`` is a one-row mock DataFrame and
        ``metadata`` is a dict copied from the document. On any failure
        (including an unsupported ``pdf_file`` type such as ``None``) the
        error is printed and ``(None, None)`` is returned.
    """
    try:
        # Check if pdf_file is a file-like object or path
        if hasattr(pdf_file, 'read'):
            # File-like object (e.g., BytesIO): pass the raw bytes to PyMuPDF
            pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        elif isinstance(pdf_file, str):
            # Path in string format
            pdf_document = fitz.open(pdf_file)
        else:
            raise ValueError("Unsupported file type. Please upload a valid PDF file.")

        try:
            # Copy the metadata so it stays usable after the document is closed
            metadata = dict(pdf_document.metadata or {})
        finally:
            # Always release the document handle, even if metadata access fails
            pdf_document.close()
        print("Retrieved Metadata:", metadata)  # Debugging step

        # Prepare DataFrame with dummy data to simplify debugging
        data = [{"Column1": "Test1", "Column2": "Test2"}]  # Mock data for debugging
        df = pd.DataFrame(data)

        return df, metadata
    except Exception as e:
        print("Error in ai_parse_pdf:", e)  # Debugging output
        return None, None


def export_to_excel(df, metadata):
    """Serialize the data (and optional metadata) into an in-memory workbook.

    Args:
        df: DataFrame written to the "Extracted Data" sheet. Must be
            non-empty.
        metadata: Dict written to a "Metadata" sheet (keys as the index,
            values in a "Value" column). Skipped when empty or not a dict.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ``.xlsx`` bytes,
        or ``None`` if validation or writing failed (the error is printed).
    """
    try:
        # Validate before opening the writer: leaving the ExcelWriter context
        # with no sheets written triggers a second error that would mask
        # this one.
        if df is None or df.empty:
            raise ValueError("DataFrame is empty or invalid.")

        output = BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Extracted Data")

            # Check metadata and write it only if it's a valid dictionary
            if metadata and isinstance(metadata, dict):
                metadata_df = pd.DataFrame.from_dict(metadata, orient="index", columns=["Value"])
                metadata_df.to_excel(writer, sheet_name="Metadata")
            else:
                print("Metadata is empty or invalid, skipping metadata export.")

        output.seek(0)
        return output
    except Exception as e:
        print("Error in export_to_excel:", e)  # Debugging output
        return None


# Gradio function to extract, display, and cleanse data
def extract_and_display_data(pdf_file, language):
    """Gradio callback: parse the uploaded PDF for the preview widgets.

    Returns:
        ``(df, metadata)`` on success; otherwise an empty DataFrame and a
        dict of the form ``{"error": message}`` so both outputs always
        receive a displayable value.
    """
    # ai_parse_pdf reports every failure as (None, None) instead of raising,
    # so no extra exception handling is needed here.
    df, metadata = ai_parse_pdf(pdf_file, language)
    if df is None or metadata is None:
        return pd.DataFrame(), {"error": "Failed to parse PDF or retrieve metadata."}
    return df, metadata


# Function to confirm and export data to Excel
def confirm_and_export_data(df, metadata):
    """Gradio callback: build the workbook and return a downloadable path.

    The result is wired to a ``gr.File`` output, which expects a filesystem
    path rather than an in-memory buffer or a dict, so the workbook bytes
    are written to a temporary ``.xlsx`` file and that path is returned.

    Returns:
        Path (``str``) of the temporary Excel file.

    Raises:
        gr.Error: If the workbook could not be created or saved; Gradio
            shows the message to the user.
    """
    excel_file = export_to_excel(df, metadata)
    if excel_file is None:
        raise gr.Error("Failed to create Excel file")

    try:
        # delete=False: the file must outlive this function so Gradio can
        # serve it; it is left in the system temp directory afterwards.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            tmp.write(excel_file.getvalue())
    except OSError as e:
        print("Error in confirm_and_export_data:", e)  # Debugging output
        raise gr.Error(str(e)) from e
    return tmp.name


# Gradio Interface setup
with gr.Blocks() as iface:
    gr.Markdown("### Simplified PDF to Excel Data Extractor for Debugging")

    # Inputs for PDF and OCR language selection
    pdf_file = gr.File(label="Upload PDF")
    language = gr.Dropdown(choices=["eng", "spa", "fra"], value="eng", label="OCR Language (if applicable)")

    # Display extracted data and metadata
    data_display = gr.DataFrame(label="Extracted Data Preview", interactive=True)
    metadata_display = gr.JSON(label="PDF Metadata")

    # Buttons for actions
    with gr.Row():
        extract_btn = gr.Button("Extract and Display Data")
        export_btn = gr.Button("Confirm and Export to Excel")

    # Define extract action
    extract_btn.click(
        fn=extract_and_display_data,
        inputs=[pdf_file, language],
        outputs=[data_display, metadata_display],
    )

    # Define export action
    export_btn.click(
        fn=confirm_and_export_data,
        inputs=[data_display, metadata_display],
        outputs=gr.File(label="Download Excel"),
    )

# Launch only when run as a script so importing this module has no side effect
if __name__ == "__main__":
    iface.launch(share=True)  # Set share=True to create a public link