Spaces:

DSatishchandra
/

PMP_PO_Extraction

Sleeping

App Files Files Community

DSatishchandra committed about 1 month ago

Commit

cf677a1

•

1 Parent(s): 83a4f63

Update federal_electric.py

Browse files

Files changed (1) hide show

federal_electric.py +71 -110

federal_electric.py CHANGED Viewed

@@ -1,122 +1,83 @@
 import pdfplumber
-import re
 import pandas as pd
-import gradio as gr
# One complete line-item row: S. No, material no, description, qty, unit,
# price, delivery date, total value, VAT %, amount incl. VAT.
_PO_ROW_RE = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NO_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Text that leaks into a description because every non-row line following a
# row is treated as a continuation of that row's description.
_UNWANTED_DESCRIPTION_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"This document is electronically approved",  # approval notice
        r"does not require any signature or stamp",   # approval notice
        r"Total Amount Excl\. VAT.*",                # totals line
        r"TWO THOUSAND.*ONLY",                       # amount written in words
        r"&",                                        # stray ampersands
        r"\.+$",                                     # trailing periods
    )
)


def _clean_description(description):
    """Remove the known unwanted phrases/symbols from one description."""
    for pattern in _UNWANTED_DESCRIPTION_RES:
        description = pattern.sub("", description).strip()
    return description


def extract_po_data(pdf_file):
    """
    Extract purchase-order line items from a PDF into a DataFrame.

    Each page's text is scanned line by line. A line matching the row
    pattern starts a new item; every following non-matching line on the same
    page is appended to that item's "Material Description". Descriptions are
    then stripped of known footer phrases and stray symbols.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (the caller in this
            file passes a filesystem path).

    Returns:
        pandas.DataFrame with the columns "S. No", "Material No",
        "Material Description", "Qty", "Unit", "Price", "Delivery Date",
        "Total Value", "Vat%" and "Amount Incl. VAT". When both the purchase
        order number and date were found, "Purchase Order No" and
        "Purchase Order Date" are inserted as the first two columns. An
        empty DataFrame is returned when no row matched.
    """
    rows = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without extractable text may give None/empty text
            # (depends on the pdfplumber version); treat it as blank.
            page_text = page.extract_text() or ""

            # Keep looking for each header field until it has been found.
            if purchase_order_no is None:
                po_no_match = _PO_NO_RE.search(page_text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
            if purchase_order_date is None:
                po_date_match = _PO_DATE_RE.search(page_text)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            current_row = None  # row still collecting continuation lines
            for line in page_text.split("\n"):
                match = _PO_ROW_RE.match(line)
                if match:
                    if current_row is not None:
                        rows.append(current_row)
                    (s_no, material_no, description, qty, unit, price,
                     delivery_date, total_value, vat, amount) = match.groups()
                    current_row = {
                        "S. No": s_no,
                        "Material No": material_no,
                        "Material Description": description.strip(),
                        "Qty": int(qty),
                        "Unit": unit,
                        "Price": float(price),
                        "Delivery Date": delivery_date,
                        "Total Value": float(total_value),
                        "Vat%": float(vat),
                        "Amount Incl. VAT": float(amount),
                    }
                elif current_row is not None:
                    # Not a row: treat as more text for the open description.
                    current_row["Material Description"] += f" {line.strip()}"

            # Rows never span pages: flush the open row at the page end.
            if current_row is not None:
                rows.append(current_row)

    df = pd.DataFrame(rows)
    if df.empty:
        # No rows matched, so there is no description column to clean.
        return df

    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)

    df["Material Description"] = (
        df["Material Description"].apply(_clean_description).str.strip()
    )
    return df
def process_and_save(pdf_file, output_format):
    """
    Extract PO data from an uploaded PDF and save it as CSV or Excel.

    Args:
        pdf_file: Uploaded-file object whose ``.name`` attribute is the path
            of the PDF on disk.
        output_format: ``"csv"`` or ``"xlsx"``.

    Returns:
        The path of the written file, ``"output.<output_format>"``.

    Raises:
        ValueError: If ``output_format`` is not ``"csv"`` or ``"xlsx"``.
    """
    # Validate first: otherwise an unsupported (or missing) format would
    # write nothing yet still return a path to a nonexistent or stale file.
    if output_format not in ("csv", "xlsx"):
        raise ValueError(
            f"Unsupported output format: {output_format!r} "
            "(expected 'csv' or 'xlsx')"
        )

    df = extract_po_data(pdf_file.name)

    output_file = f"output.{output_format}"
    if output_format == "csv":
        df.to_csv(output_file, index=False)
    else:
        df.to_excel(output_file, index=False, engine="openpyxl")
    return output_file
def gradio_interface(pdf_file, output_format):
    """Gradio callback: convert the uploaded PDF and return the output path."""
    return process_and_save(pdf_file, output_format)
# Gradio app: upload a PDF, choose the output format, download the result.
_po_inputs = [
    gr.File(label="Upload PDF"),
    gr.Radio(["csv", "xlsx"], label="Output Format"),
]
_po_output = gr.File(label="Download Output")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_po_inputs,
    outputs=_po_output,
    title="Enhanced PO Data Extractor",
    description=(
        "Extract data from Purchase Orders, including multi-line descriptions, "
        "and clean unwanted text or symbols. Download as CSV or Excel."
    ),
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()

 import pdfplumber
 import pandas as pd
+import re
def extract_text_from_pdf(pdf_file):
    """
    Return the text of every page of a PDF as one newline-separated string.

    Args:
        pdf_file: Either an object whose ``.name`` attribute is the PDF's
            path on disk, or a plain path (``str``/``bytes``/``os.PathLike``)
            that is passed to ``pdfplumber.open`` directly.

    Returns:
        str: The text of all pages that produced any text, joined with
        newlines; ``""`` when no page has extractable text.
    """
    import os

    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        source = pdf_file
    else:
        source = pdf_file.name

    with pdfplumber.open(source) as pdf:
        # A page without extractable text may give None/empty text
        # (depends on the pdfplumber version); normalize it to "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Separate pages with a newline so the last line of one page does not
    # fuse with the first line of the next when the text is split into lines.
    return "\n".join(page_text for page_text in page_texts if page_text)
# Start of an item row: a leading item number, whitespace, then the rest.
_ITEM_ROW_RE = re.compile(r"^(?P<Item>\d+)\s+(?P<Description>.+)")
# A quantity followed by one of the recognized units.
_QTY_UNIT_RE = re.compile(r"(?P<Qty>\d+)\s+(?P<Unit>Nos\.|Set)")
# An amount must contain a digit ("10", "10.50", "1,200.00", ".50"); a bare
# "." (e.g. the one ending "Nos.") is not an amount.
_AMOUNT = r"(?:\d+(?:,\d{3})*(?:\.\d+)?|\.\d+)"
# Two amounts closing the line; the look-behind stops a match from starting
# in the middle of a longer number.
_PRICES_RE = re.compile(
    rf"(?<![\d.,])(?P<UnitPrice>{_AMOUNT})\s+(?P<TotalPrice>{_AMOUNT})\s*$"
)


def parse_po_items_with_filters(text):
    """
    Parse purchase-order items from extracted PDF text.

    A line beginning with a number followed by whitespace starts a new item;
    the number is the "Item" and the rest of the line starts its
    "Description". Later lines that do not start a new item are appended to
    the description. "Qty"/"Unit" and "Unit Price"/"Total Price" are taken
    from any line of the item that contains them (a later line overrides an
    earlier one). The description keeps the full remainder of the item line,
    so it also contains the quantity and price text. Lines before the first
    item are ignored.

    Args:
        text: Extracted text; ``None`` or ``""`` is treated as "no items".

    Returns:
        tuple: ``(DataFrame, status message)`` with the columns "Item",
        "Description", "Qty", "Unit", "Unit Price" and "Total Price" (all
        strings, ``""`` when not found), or ``(None, message)`` when no item
        was found.
    """
    if not text:
        return None, "No items found. Please check the PDF file format."

    items = []
    current_item = None
    description_parts = []

    for line in text.splitlines():
        item_match = _ITEM_ROW_RE.match(line)
        if item_match:
            # Close the previous item before opening the new one.
            if current_item is not None:
                current_item["Description"] = " ".join(description_parts).strip()
                items.append(current_item)
            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_parts = [item_match.group("Description")]
        elif current_item is not None:
            continuation = line.strip()
            if continuation:  # blank lines would only add stray spaces
                description_parts.append(continuation)
        else:
            # Text before the first item: a quantity or price found here must
            # not create a phantom row that has no item number.
            continue

        qty_match = _QTY_UNIT_RE.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group("Unit")

        price_match = _PRICES_RE.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    # Save the last item.
    if current_item is not None:
        current_item["Description"] = " ".join(description_parts).strip()
        items.append(current_item)

    if not items:
        return None, "No items found. Please check the PDF file format."

    return pd.DataFrame(items), "Data extracted successfully."
def save_to_excel(df, output_path="federal_electric_extracted_data.xlsx"):
    """
    Write ``df`` to an Excel file without the index column.

    Args:
        df: The DataFrame to save.
        output_path: Destination file; defaults to
            "federal_electric_extracted_data.xlsx".

    Returns:
        ``output_path``, unchanged.
    """
    df.to_excel(output_path, index=False)
    return output_path
def process_pdf(file):
    """
    Extract the items of an uploaded PO PDF and save them to Excel.

    Args:
        file: The uploaded file, passed to ``extract_text_from_pdf``.

    Returns:
        tuple: ``(output_path, status)`` on success, or ``(None, status)``
        when no items were found or any step raised; in the latter case
        ``status`` is "Error during processing: <exception text>".
    """
    import logging

    try:
        text = extract_text_from_pdf(file)
        df, status = parse_po_items_with_filters(text)
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        # Boundary handler: callers only receive a message, so record the
        # full traceback here instead of losing it.
        logging.getLogger(__name__).exception("Failed to process PDF")
        return None, f"Error during processing: {e}"