import re

import gradio as gr
import pandas as pd
import pdfplumber

# Matches one complete line-item row. Capture groups, in order: S. No,
# Material No, Material Description, Qty, Unit, Price, Delivery Date,
# Total Value, Vat%, Amount Incl. VAT.
_ROW_PATTERN = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NO_PATTERN = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_PATTERN = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Boilerplate that leaks into descriptions because every non-row line that
# follows a row is treated as a description continuation. Applied in order.
_UNWANTED_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"This document is electronically approved",  # approval phrase
        r"does not require any signature or stamp",  # approval note
        r"Total Amount Excl\. VAT.*",  # totals line
        r"TWO THOUSAND.*ONLY",  # total written out in words
        r"&",  # stray symbols like `&`
        r"\.+$",  # trailing periods
    )
)

# Column order of the extracted line items; also used to give an empty
# result a well-defined schema.
_ROW_COLUMNS = [
    "S. No",
    "Material No",
    "Material Description",
    "Qty",
    "Unit",
    "Price",
    "Delivery Date",
    "Total Value",
    "Vat%",
    "Amount Incl. VAT",
]

_OUTPUT_PATH = "federal_electric_extracted_data.xlsx"


def _clean_description(description):
    """Remove known boilerplate phrases and stray symbols from a description."""
    for pattern in _UNWANTED_PATTERNS:
        description = pattern.sub("", description).strip()
    return description.strip()


def _parse_page_lines(lines):
    """Turn the text lines of one page into a list of line-item dicts.

    A line matching ``_ROW_PATTERN`` starts a new row. Any other line that
    follows a row is appended to that row's "Material Description" as a
    continuation. Lines before the first row on the page are ignored.
    """
    rows = []
    current = None
    for line in lines:
        match = _ROW_PATTERN.match(line)
        if match:
            # A new row begins, so the previous one is complete.
            if current is not None:
                rows.append(current)
            current = {
                "S. No": match[1],
                "Material No": match[2],
                "Material Description": match[3].strip(),
                "Qty": int(match[4]),
                "Unit": match[5],
                "Price": float(match[6]),
                "Delivery Date": match[7],
                "Total Value": float(match[8]),
                "Vat%": float(match[9]),
                "Amount Incl. VAT": float(match[10]),
            }
        elif current is not None:
            current["Material Description"] += f" {line.strip()}"
    # Flush the row still pending at the end of the page so it is not lost
    # when parsing restarts on the next page.
    if current is not None:
        rows.append(current)
    return rows


def extract_po_data(pdf_file):
    """Extract Purchase Order line items from a PDF into a DataFrame.

    Handles multi-line Material Descriptions and strips known unwanted text
    and symbols from them.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        A DataFrame with one row per line item and the columns listed in
        ``_ROW_COLUMNS``. When both the Purchase Order number and date are
        found, they are inserted as the first two columns. If no line item
        is found, the DataFrame is empty but still has its columns.
    """
    rows = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against pages with no extractable text (for example,
            # image-only pages) so `.split` is never called on a non-string.
            text = page.extract_text() or ""

            # Number and date are looked up independently and only until
            # each has been found once.
            if purchase_order_no is None:
                po_no_match = _PO_NO_PATTERN.search(text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
            if purchase_order_date is None:
                po_date_match = _PO_DATE_PATTERN.search(text)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            rows.extend(_parse_page_lines(text.split("\n")))

    # Explicit columns keep the schema stable even when `rows` is empty.
    df = pd.DataFrame(rows, columns=_ROW_COLUMNS)

    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)

    df["Material Description"] = df["Material Description"].map(_clean_description)
    return df


def process_pdf(file):
    """Process the uploaded PDF and save the extracted data to Excel.

    Args:
        file: The upload handed over by Gradio: either an object exposing
            the file path as ``.name`` or the path string itself.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is None
        when processing failed or no line items were found.
    """
    if file is None:
        return None, "Error during processing: no file was uploaded."

    try:
        # Accept both a tempfile-like wrapper and a plain path string.
        pdf_path = getattr(file, "name", file)
        df = extract_po_data(pdf_path)
        if df.empty:
            return (
                None,
                "Error during processing: no purchase order line items were found in the PDF.",
            )

        df.to_excel(_OUTPUT_PATH, index=False, engine="openpyxl")
        return _OUTPUT_PATH, "Data extraction successful!"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of
        # crashing the app.
        return None, f"Error during processing: {str(e)}"


# Gradio app interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ],
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols.",
)

if __name__ == "__main__":
    iface.launch()