Spaces:

DSatishchandra
/

PMP_PO_Extraction

Running

App Files Files Community

PMP_PO_Extraction / federal_electric.py

DSatishchandra

Update federal_electric.py

6bf4435 verified 18 days ago

raw

history blame

4.92 kB

	import pdfplumber
	import re
	import pandas as pd
	import gradio as gr

	# Column names of one parsed line item, in output order. Also used to give an
	# empty result the same columns as a populated one.
	_ROW_COLUMNS = (
	    "S. No",
	    "Material No",
	    "Material Description",
	    "Qty",
	    "Unit",
	    "Price",
	    "Delivery Date",
	    "Total Value",
	    "Vat%",
	    "Amount Incl. VAT",
	)

	# One complete line-item row. Leading and trailing whitespace is optional so
	# that both padded and unpadded extracted lines are recognised.
	_ROW_RE = re.compile(
	    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)"
	    r"\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
	)
	_PO_NO_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
	_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

	# Text removed from descriptions, applied in this order (case-insensitive).
	# The trailing-period rule must stay last so it sees the already-trimmed text.
	_UNWANTED_RES = tuple(
	    re.compile(pattern, re.IGNORECASE)
	    for pattern in (
	        r"This document is electronically approved",  # Matches exact phrase
	        r"does not require any signature or stamp",  # Matches approval notes
	        r"Total Amount Excl\. VAT.*",  # Matches totals
	        r"TWO THOUSAND.*ONLY",  # Matches written totals
	        r"&",  # Removes stray symbols like `&`
	        r"\.+$",  # Removes trailing periods
	    )
	)


	def _clean_description(description):
	    """Return *description* with every unwanted pattern removed and stripped."""
	    for unwanted in _UNWANTED_RES:
	        description = unwanted.sub("", description).strip()
	    return description


	def _parse_line_items(lines):
	    """Return the line-item dicts found in one page's text lines.

	    A line matching the row pattern starts a new item. Any other line that
	    follows an item is appended to that item's "Material Description".
	    Lines seen before the first item on the page are ignored.
	    """
	    items = []
	    current = None
	    for line in lines:
	        match = _ROW_RE.match(line)
	        if match:
	            if current is not None:
	                items.append(current)
	            (serial, material_no, description, qty, unit,
	             price, delivery, total, vat, gross) = match.groups()
	            current = {
	                "S. No": serial,
	                "Material No": material_no,
	                "Material Description": description.strip(),
	                "Qty": int(qty),
	                "Unit": unit,
	                "Price": float(price),
	                "Delivery Date": delivery,
	                "Total Value": float(total),
	                "Vat%": float(vat),
	                "Amount Incl. VAT": float(gross),
	            }
	        elif current is not None:
	            # Not a row: treat it as a continuation of the description.
	            current["Material Description"] += f" {line.strip()}"

	    # Flush the item still being built when the page ends.
	    if current is not None:
	        items.append(current)
	    return items


	def extract_po_data(pdf_file):
	    """Extract purchase-order line items from a PDF into a DataFrame.

	    Each page's text is scanned line by line. Rows matching the line-item
	    pattern become records, and the lines that follow a row are folded into
	    its "Material Description", which is then cleaned of known boilerplate.

	    Args:
	        pdf_file: Value passed straight to ``pdfplumber.open``.

	    Returns:
	        A ``pandas.DataFrame`` with one row per line item and the columns in
	        ``_ROW_COLUMNS``. When both a purchase order number and a purchase
	        order date are found, "Purchase Order No" and "Purchase Order Date"
	        are inserted as the first two columns. If no line items are found,
	        the frame is empty but still carries the columns.

	    Raises:
	        ValueError: If a matched numeric field cannot be converted by
	            ``float`` (the pattern allows several dots in one field).
	    """
	    items = []
	    purchase_order_no = None
	    purchase_order_date = None

	    with pdfplumber.open(pdf_file) as pdf:
	        for page in pdf.pages:
	            # A page without a text layer may yield no text at all.
	            text = page.extract_text() or ""

	            # Each header field is taken from the first page that has it.
	            if purchase_order_no is None:
	                po_no_match = _PO_NO_RE.search(text)
	                if po_no_match:
	                    purchase_order_no = po_no_match.group(1)
	            if purchase_order_date is None:
	                po_date_match = _PO_DATE_RE.search(text)
	                if po_date_match:
	                    purchase_order_date = po_date_match.group(1)

	            items.extend(_parse_line_items(text.split("\n")))

	    # Passing the columns keeps "Material Description" present even when no
	    # rows were found, so the cleaning step below cannot raise KeyError.
	    df = pd.DataFrame(items, columns=list(_ROW_COLUMNS))

	    # Insert Purchase Order No and Purchase Order Date at the beginning
	    if purchase_order_no and purchase_order_date:
	        df.insert(0, "Purchase Order No", purchase_order_no)
	        df.insert(1, "Purchase Order Date", purchase_order_date)

	    # _clean_description already strips, so no separate strip pass is needed.
	    df["Material Description"] = df["Material Description"].apply(_clean_description)

	    return df

	def process_pdf(file):
	    """Extract PO data from an uploaded PDF and save it as an Excel workbook.

	    Args:
	        file: The upload handed over by the Gradio ``File`` input. Either an
	            object exposing the path as ``.name`` or the path itself is
	            accepted. ``None`` means nothing was uploaded.
	            NOTE(review): which form Gradio passes depends on its version and
	            the component's ``type`` setting - confirm for this deployment.

	    Returns:
	        A ``(output_path, status_message)`` tuple. ``output_path`` is the
	        written ``.xlsx`` file on success and ``None`` otherwise.
	    """
	    if file is None:
	        return None, "Please upload a PDF file."

	    # Use `.name` when the upload object has it, else treat the value as a path.
	    pdf_path = getattr(file, "name", file)
	    output_path = "federal_electric_extracted_data.xlsx"

	    try:
	        # Process the extracted text into a DataFrame
	        df = extract_po_data(pdf_path)

	        # Guard against writing a header-only workbook and calling it a success.
	        if df.empty:
	            return None, "No line items were found in the PDF."

	        # Save the DataFrame to an Excel file
	        df.to_excel(output_path, index=False, engine="openpyxl")
	    except Exception as e:
	        # UI boundary: report any failure as a status message instead of raising.
	        return None, f"Error during processing: {str(e)}"

	    return output_path, "Data extraction successful!"

	# Gradio UI: one PDF upload in, a downloadable result file and a status line
	# out, all wired to process_pdf.
	iface = gr.Interface(
	    title="Enhanced PO Data Extractor",
	    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols.",
	    fn=process_pdf,
	    inputs=[gr.File(label="Upload PDF")],
	    outputs=[gr.File(label="Download Extracted Data"), gr.Textbox(label="Status")],
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()