Spaces:

DSatishchandra
/

PMP_PO_Extraction

Sleeping

App Files Files Community

PMP_PO_Extraction / Federal Electric

DSatishchandra

Create Federal Electric

ea15866 verified 18 days ago

raw

history blame

5.06 kB

	import pdfplumber
	import re
	import pandas as pd
	import gradio as gr

	def extract_po_data(pdf_file):
	"""
	Extracts Purchase Order data with enhanced multi-line Material Description handling,
	and cleans unwanted text or symbols.
	"""
	data = []
	purchase_order_no = None
	purchase_order_date = None

	with pdfplumber.open(pdf_file) as pdf:
	for page in pdf.pages:
	# Extract text from page
	lines = page.extract_text().split("\n")
	temp_row = None # Temporary row to handle multi-line descriptions

	# Extract Purchase Order Number and Date (Assume it's on the first page)
	if purchase_order_no is None: # Only extract once
	po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
	po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))

	if po_no_match:
	purchase_order_no = po_no_match.group(1)
	if po_date_match:
	purchase_order_date = po_date_match.group(1)

	# Process each line to extract data
	for line in lines:
	# Regex pattern for rows (excluding multi-line descriptions)
	pattern = r"^\s(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s$"
	match = re.match(pattern, line)

	if match:
	# If there's a match, capture the full row
	if temp_row: # Append the previous temp_row if it exists
	data.append(temp_row)
	temp_row = None
	temp_row = {
	"S. No": match[1],
	"Material No": match[2],
	"Material Description": match[3].strip(),
	"Qty": int(match[4]),
	"Unit": match[5],
	"Price": float(match[6]),
	"Delivery Date": match[7],
	"Total Value": float(match[8]),
	"Vat%": float(match[9]),
	"Amount Incl. VAT": float(match[10]),
	}
	elif temp_row:
	# If no match, treat it as a continuation of Material Description
	temp_row["Material Description"] += f" {line.strip()}"

	# Append the last row
	if temp_row:
	data.append(temp_row)

	# Create DataFrame
	df = pd.DataFrame(data)

	# Insert Purchase Order No and Purchase Order Date at the beginning
	if purchase_order_no and purchase_order_date:
	df.insert(0, "Purchase Order No", purchase_order_no)
	df.insert(1, "Purchase Order Date", purchase_order_date)

	# Filter unwanted text from Material Description
	def clean_description(description):
	# Define unwanted patterns
	unwanted_patterns = [
	r"This document is electronically approved", # Matches exact phrase
	r"does not require any signature or stamp", # Matches approval notes
	r"Total Amount Excl\. VAT.*", # Matches totals
	r"TWO THOUSAND.*ONLY", # Matches written totals
	r"&", # Removes stray symbols like `&`
	r"\.+$", # Removes trailing periods
	]
	for pattern in unwanted_patterns:
	description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
	return description

	df["Material Description"] = df["Material Description"].apply(clean_description)

	# Strip extra spaces
	df["Material Description"] = df["Material Description"].str.strip()

	return df

	def process_and_save(pdf_file, output_format):
	"""
	Processes the uploaded PDF and saves the extracted data as an Excel or CSV file.
	"""
	df = extract_po_data(pdf_file.name)

	# Save the file in the desired format
	output_file = f"output.{output_format}"
	if output_format == "csv":
	df.to_csv(output_file, index=False)
	elif output_format == "xlsx":
	df.to_excel(output_file, index=False, engine="openpyxl")

	return output_file

	# Gradio interface function
	def gradio_interface(pdf_file, output_format):
	output_file = process_and_save(pdf_file, output_format)
	return output_file

	# Gradio app interface
	iface = gr.Interface(
	fn=gradio_interface,
	inputs=[gr.File(label="Upload PDF"), gr.Radio(["csv", "xlsx"], label="Output Format")],
	outputs=gr.File(label="Download Output"),
	title="Enhanced PO Data Extractor",
	description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel."
	)

	if __name__ == "__main__":
	iface.launch()