Spaces:

neerajkalyank
/

p2e

Sleeping

App Files Files Community

neerajkalyank committed on Nov 9

Commit

ab51c22

•

1 Parent(s): 4207427

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -30

app.py CHANGED Viewed

@@ -1,45 +1,111 @@
 import pandas as pd
 import fitz  # PyMuPDF for PDF reading
 import pytesseract  # Tesseract for OCR
 import re
 def ai_parse_pdf(pdf_file, language="eng"):
     try:
-        # Open the PDF
-        if hasattr(pdf_file, 'read'):
             pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
-        else:
             pdf_document = fitz.open(pdf_file)
-        # Define expected columns (can be flexible depending on the detected format)
-        expected_columns = ["Purchase Order", "Pos.", "Item Code", "Unit", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"]
-        all_data = []
-        detected_columns = []
-        # Process each page
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_num)
-            text = page.get_text("text")
-            # Detect column headers using keywords or patterns
-            headers = re.findall(r"(Purchase Order|Pos.|Item Code|Unit|Delivery Date|Quantity|Basic Price|Amount|SUB TOTAL)", text)
-            detected_columns = headers if headers else expected_columns  # Fallback if no headers detected
-            # Extract data rows based on known patterns or structure (example pattern here)
-            data_rows = re.findall(r"(your row matching pattern here)", text)
-            # Append parsed data to all_data
-            for row in data_rows:
-                row_data = row.split()  # Adjust split method as per data format
-                all_data.append(row_data)
-        # Create DataFrame with dynamically detected columns
-        df = pd.DataFrame(all_data, columns=detected_columns)
-        # Reorder to expected format, filling missing columns if necessary
-        df = df.reindex(columns=expected_columns, fill_value="")
-        return df, pdf_document.metadata
     except Exception as e:
-        print("Error in ai_parse_pdf:", e)
-        return None, None

+import gradio as gr
 import pandas as pd
 import fitz  # PyMuPDF for PDF reading
 import pytesseract  # Tesseract for OCR
+from io import BytesIO
 import re
+from tqdm import tqdm  # For progress indicators
# Point pytesseract at the Tesseract binary. Prefer whatever executable is on
# PATH so the app works regardless of install location; fall back to the
# previously hard-coded path when PATH lookup finds nothing.
import shutil  # stdlib; imported next to its only use
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or r'/usr/local/bin/tesseract'
 def ai_parse_pdf(pdf_file, language="eng"):
     try:
+        # Check if pdf_file is a file-like object or path
+        if hasattr(pdf_file, 'read'):  # If it's a file-like object (e.g., BytesIO)
             pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        elif isinstance(pdf_file, str):  # If it's a path in string format
             pdf_document = fitz.open(pdf_file)
+        else:
+            raise ValueError("Unsupported file type. Please upload a valid PDF file.")
+        # Retrieve metadata
+        metadata = pdf_document.metadata or {}
+        print("Retrieved Metadata:", metadata)  # Debugging step
+        # Prepare DataFrame with dummy data to simplify debugging
+        data = [{"Column1": "Test1", "Column2": "Test2"}]  # Mock data for debugging
+        df = pd.DataFrame(data)
+        return df, metadata
+    except Exception as e:
+        print("Error in ai_parse_pdf:", e)  # Debugging output
+        return None, None
def export_to_excel(df, metadata):
    """Serialize a DataFrame (and optional metadata) into an in-memory Excel file.

    Args:
        df: DataFrame written to the "Extracted Data" sheet. Must be non-empty.
        metadata: Optional dict; when it is a non-empty dict it is written to a
            second sheet named "Metadata" with one row per key and a single
            "Value" column. Anything else is skipped with a printed notice.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the workbook, or
        ``None`` if validation or writing fails (the error is printed).
    """
    try:
        # Validate before creating the writer. Raising inside the ExcelWriter
        # context makes the writer try to save an empty workbook on exit, which
        # can replace this message with an unrelated one.
        if df is None or df.empty:
            raise ValueError("DataFrame is empty or invalid.")
        output = BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Extracted Data")
            # Write metadata only if it is a non-empty dictionary.
            if metadata and isinstance(metadata, dict):
                metadata_df = pd.DataFrame.from_dict(metadata, orient="index", columns=["Value"])
                metadata_df.to_excel(writer, sheet_name="Metadata")
            else:
                print("Metadata is empty or invalid, skipping metadata export.")
        output.seek(0)  # rewind so callers can read from the start
        return output
    except Exception as e:
        print("Error in export_to_excel:", e)  # Debugging output
        return None
# Gradio callback: parse the uploaded PDF and hand back data plus metadata
def extract_and_display_data(pdf_file, language):
    """Run ``ai_parse_pdf`` and adapt its result for the preview widgets.

    Args:
        pdf_file: The uploaded PDF, passed straight through to ``ai_parse_pdf``.
        language: OCR language code, passed straight through as well.

    Returns:
        ``(DataFrame, metadata)`` on success. If parsing yields ``None`` for
        either part, or any exception escapes, an empty DataFrame and a dict
        with a single ``"error"`` key are returned instead.
    """
    try:
        frame, pdf_metadata = ai_parse_pdf(pdf_file, language)
        parse_failed = frame is None or pdf_metadata is None
        if not parse_failed:
            return frame, pdf_metadata
        return pd.DataFrame(), {"error": "Failed to parse PDF or retrieve metadata."}
    except Exception as exc:
        print("Error in extract_and_display_data:", exc)
        return pd.DataFrame(), {"error": str(exc)}
# Gradio callback: export the confirmed data to a downloadable Excel file
def confirm_and_export_data(df, metadata):
    """Build the Excel export and return it as a file path for download.

    This callback is wired to a ``gr.File`` output. That component is given a
    file path here rather than the in-memory buffer or an error dict the
    previous version returned (see the Gradio File documentation), so the
    workbook bytes are written to a temporary ``.xlsx`` file.

    Args:
        df: DataFrame to export (forwarded to ``export_to_excel``).
        metadata: Metadata dict to export (forwarded to ``export_to_excel``).

    Returns:
        Path (``str``) of the temporary ``.xlsx`` file, or ``None`` when the
        export fails; failures are printed, not raised.
    """
    import tempfile  # local import: only this callback needs it

    try:
        excel_file = export_to_excel(df, metadata)
        if excel_file is None:
            print("Error in confirm_and_export_data:", "Failed to create Excel file")
            return None
        # delete=False: the file must outlive this function so it can be served.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            tmp.write(excel_file.getvalue())
        return tmp.name
    except Exception as e:
        print("Error in confirm_and_export_data:", e)  # Debugging output
        return None
# Gradio interface setup: upload a PDF, preview the extracted data and
# metadata, then export both to Excel.
with gr.Blocks() as iface:
    gr.Markdown("### Simplified PDF to Excel Data Extractor for Debugging")
    # Inputs for PDF and OCR language selection
    pdf_file = gr.File(label="Upload PDF")
    language = gr.Dropdown(choices=["eng", "spa", "fra"], value="eng", label="OCR Language (if applicable)")
    # Display extracted data and metadata
    data_display = gr.DataFrame(label="Extracted Data Preview", interactive=True)
    metadata_display = gr.JSON(label="PDF Metadata")
    # Buttons for actions
    with gr.Row():
        extract_btn = gr.Button("Extract and Display Data")
        export_btn = gr.Button("Confirm and Export to Excel")
    # Download slot for the exported workbook; created after the button row so
    # it keeps the position it had when it was built inline in the click call.
    excel_download = gr.File(label="Download Excel")
    # Extract action: fills the preview table and the metadata viewer
    extract_btn.click(
        fn=extract_and_display_data,
        inputs=[pdf_file, language],
        outputs=[data_display, metadata_display],
    )
    # Export action: reads the (possibly edited) preview and metadata back
    export_btn.click(
        fn=confirm_and_export_data,
        inputs=[data_display, metadata_display],
        outputs=excel_download,
    )
# Launch only when run as a script, so importing this module (e.g. from tests
# or tooling) does not start a server as a side effect.
if __name__ == "__main__":
    iface.launch(share=True)  # Set share=True to create a public link