Spaces:

DSatishchandra
/

PMP_PO_Extraction

Sleeping

App Files Files Community

DSatishchandra committed on 18 days ago

Commit

ea15866

•

1 Parent(s): e0d3587

Create Federal Electric

Browse files

Files changed (1) hide show

Federal Electric +122 -0

Federal Electric ADDED Viewed

	@@ -0,0 +1,122 @@

+import pdfplumber
+import re
+import pandas as pd
+import gradio as gr
+def extract_po_data(pdf_file):
+    """
+    Extracts Purchase Order data with enhanced multi-line Material Description handling,
+    and cleans unwanted text or symbols.
+    """
+    data = []
+    purchase_order_no = None
+    purchase_order_date = None
+    with pdfplumber.open(pdf_file) as pdf:
+        for page in pdf.pages:
+            # Extract text from page
+            lines = page.extract_text().split("\n")
+            temp_row = None  # Temporary row to handle multi-line descriptions
+            # Extract Purchase Order Number and Date (Assume it's on the first page)
+            if purchase_order_no is None:  # Only extract once
+                po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
+                po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))
+                if po_no_match:
+                    purchase_order_no = po_no_match.group(1)
+                if po_date_match:
+                    purchase_order_date = po_date_match.group(1)
+            # Process each line to extract data
+            for line in lines:
+                # Regex pattern for rows (excluding multi-line descriptions)
+                pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
+                match = re.match(pattern, line)
+                if match:
+                    # If there's a match, capture the full row
+                    if temp_row:  # Append the previous temp_row if it exists
+                        data.append(temp_row)
+                        temp_row = None
+                    temp_row = {
+                        "S. No": match[1],
+                        "Material No": match[2],
+                        "Material Description": match[3].strip(),
+                        "Qty": int(match[4]),
+                        "Unit": match[5],
+                        "Price": float(match[6]),
+                        "Delivery Date": match[7],
+                        "Total Value": float(match[8]),
+                        "Vat%": float(match[9]),
+                        "Amount Incl. VAT": float(match[10]),
+                    }
+                elif temp_row:
+                    # If no match, treat it as a continuation of Material Description
+                    temp_row["Material Description"] += f" {line.strip()}"
+            # Append the last row
+            if temp_row:
+                data.append(temp_row)
+    # Create DataFrame
+    df = pd.DataFrame(data)
+    # Insert Purchase Order No and Purchase Order Date at the beginning
+    if purchase_order_no and purchase_order_date:
+        df.insert(0, "Purchase Order No", purchase_order_no)
+        df.insert(1, "Purchase Order Date", purchase_order_date)
+    # Filter unwanted text from Material Description
+    def clean_description(description):
+        # Define unwanted patterns
+        unwanted_patterns = [
+            r"This document is electronically approved",  # Matches exact phrase
+            r"does not require any signature or stamp",   # Matches approval notes
+            r"Total Amount Excl\. VAT.*",                # Matches totals
+            r"TWO THOUSAND.*ONLY",                       # Matches written totals
+            r"&",                                        # Removes stray symbols like `&`
+            r"\.+$",                                     # Removes trailing periods
+        ]
+        for pattern in unwanted_patterns:
+            description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
+        return description
+    df["Material Description"] = df["Material Description"].apply(clean_description)
+    # Strip extra spaces
+    df["Material Description"] = df["Material Description"].str.strip()
+    return df
+def process_and_save(pdf_file, output_format):
+    """
+    Processes the uploaded PDF and saves the extracted data as an Excel or CSV file.
+    """
+    df = extract_po_data(pdf_file.name)
+    # Save the file in the desired format
+    output_file = f"output.{output_format}"
+    if output_format == "csv":
+        df.to_csv(output_file, index=False)
+    elif output_format == "xlsx":
+        df.to_excel(output_file, index=False, engine="openpyxl")
+    return output_file
+# Gradio interface function
+def gradio_interface(pdf_file, output_format):
+    output_file = process_and_save(pdf_file, output_format)
+    return output_file
+# Gradio app interface
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[gr.File(label="Upload PDF"), gr.Radio(["csv", "xlsx"], label="Output Format")],
+    outputs=gr.File(label="Download Output"),
+    title="Enhanced PO Data Extractor",
+    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel."
+)
+if __name__ == "__main__":
+    iface.launch()