Spaces:

DSatishchandra
/

PMP_PO_Extraction

Sleeping

App Files Files Community

PMP_PO_Extraction / bhel.py

DSatishchandra

Update bhel.py

8a7c854 verified 12 months ago

raw

history blame contribute delete

4.23 kB

	import pdfplumber
	import pandas as pd
	import re
	import gradio as gr

	def extract_data(
	    pdf_file,
	    start_si=10,
	    output_path="/tmp/Extracted_PO_Data_Dynamic.xlsx",
	):
	    """
	    Extract purchase-order rows from a PDF and save them to an Excel file.

	    The text of every page is scanned line by line. A line is kept as a data
	    row when it starts with an integer SI No that is at least ``start_si`` and
	    its whitespace-separated tokens 3..8 parse as unit, quantity, delivery
	    quantity, delivery date, unit rate and value. Lines that do not fit this
	    layout are skipped. There is no upper SI No limit: every qualifying row
	    found in the document is extracted.

	    Args:
	        pdf_file: PDF to read; passed unchanged to ``pdfplumber.open``.
	        start_si: Lowest SI No to extract; rows below it are skipped.
	        output_path: Where the Excel file is written. Every call that uses
	            the default overwrites the same file.

	    Returns:
	        str: ``output_path``, after the workbook has been written.
	    """
	    columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
	    rows = []

	    # Compiled once: the pattern is applied to every line of every page.
	    si_no_re = re.compile(r"^\s*(\d+)\s")

	    with pdfplumber.open(pdf_file) as pdf:
	        for page in pdf.pages:
	            # extract_text() may yield nothing for a page without text.
	            full_text = page.extract_text() or ""

	            for line in full_text.splitlines():
	                si_no_match = si_no_re.match(line)
	                if not si_no_match:
	                    continue

	                si_no = int(si_no_match.group(1))
	                if si_no < start_si:
	                    continue  # Skip rows below the start SI No

	                parts = line.split()
	                try:
	                    unit = parts[3]
	                    quantity = _parse_po_number(parts[4], int)
	                    dely_qty = _parse_po_number(parts[5], int)
	                    dely_date = parts[6]
	                    unit_rate = _parse_po_number(parts[7], float)
	                    value = _parse_po_number(parts[8], float)
	                except (ValueError, IndexError):
	                    # Not a data row (too few tokens or non-numeric fields).
	                    continue

	                # Looked up only for rows that parsed, within this page's text.
	                material_desc = extract_material_description(full_text, si_no)
	                rows.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value])

	    # Convert data to DataFrame and save as Excel
	    df = pd.DataFrame(rows, columns=columns)
	    df.to_excel(output_path, index=False)
	    return output_path


	def _parse_po_number(token, cast):
	    """Convert a numeric token with ``cast`` after removing thousands separators."""
	    # "1,234.50" would otherwise raise ValueError and drop the whole row.
	    return cast(token.replace(",", ""))

	def extract_material_description(full_text, si_no):
	    """
	    Build the multi-line Material Description for one SI No row.

	    The row is located by its SI No followed by a ``BPS <digits>`` code and
	    the nearest following ``Material Number: <digits>``. The HSN Code and IGST
	    rate are then taken from the first occurrence at or after that row, so
	    each row on a multi-item page gets its own values. If the row is not
	    found, or nothing follows it, the whole text is searched instead.

	    Args:
	        full_text: Text to search (``extract_data`` passes one page's text).
	        si_no: SI No of the row to describe.

	    Returns:
	        str: Newline-separated lines: the BPS code and ``Material Number``
	        (only when the row was found), then ``HSN Code: ...`` and
	        ``IGST: ...``, each reading ``Not Found`` when absent.
	    """
	    segments = []

	    # (?<!\d) stops a short SI No (e.g. 10) from matching the tail of a
	    # longer one (e.g. 110).
	    # NOTE: the lazy ".*?" still reaches into a later row if this row has no
	    # "Material Number:" of its own.
	    row_pattern = rf"(?<!\d){re.escape(str(si_no))}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)"
	    row_match = re.search(row_pattern, full_text, re.DOTALL)
	    if row_match:
	        segments.append(row_match.group(1))
	        segments.append(f"Material Number: {row_match.group(2)}")

	    # Start the detail searches at the row so earlier rows' values are ignored.
	    row_start = row_match.start() if row_match else 0

	    # Extract HSN Code
	    hsn_code_match = _search_from(r"HSN\s+Code:\s*(\d+)", full_text, row_start)
	    if hsn_code_match:
	        segments.append(f"HSN Code: {hsn_code_match.group(1)}")
	    else:
	        segments.append("HSN Code: Not Found")

	    # Extract IGST (whitespace around ':' is optional; decimal rates allowed)
	    igst_match = _search_from(r"IGST\s*:\s*(\d+(?:\.\d+)?)\s*%", full_text, row_start)
	    if igst_match:
	        segments.append(f"IGST: {igst_match.group(1)} %")
	    else:
	        segments.append("IGST: Not Found")

	    return "\n".join(segments).strip()


	def _search_from(pattern, text, start):
	    """Search ``text`` from ``start``; fall back to the whole text if nothing is found."""
	    compiled = re.compile(pattern)
	    found = compiled.search(text, start)
	    if found is None and start:
	        found = compiled.search(text)
	    return found

	def process_pdf(file):
	"""
	Process the uploaded PDF and return the extracted data.
	"""
	try:
	# Extract text from the PDF
	output_path = extract_data(file.name)
	return output_path, "Data extraction successful!"
	except Exception as e:
	return None, f"Error during processing: {str(e)}"

	# Gradio callback
	def gradio_interface(pdf_file):
	    """
	    Gradio handler: turn the uploaded PDF into an Excel file.

	    Args:
	        pdf_file: Uploaded-file object whose ``name`` attribute is the PDF path.

	    Returns:
	        The Excel file path returned by ``extract_data``.
	    """
	    uploaded_pdf_path = pdf_file.name
	    excel_path = extract_data(uploaded_pdf_path)
	    return excel_path

	# Build the Gradio UI: one PDF upload in, one Excel download out.
	_pdf_input = gr.File(label="Upload PDF")
	_excel_output = gr.File(label="Download Extracted Excel")

	interface = gr.Interface(
	    title="Dynamic BHEL PO Data Extractor",
	    description="Upload a PDF to extract accurate Material Numbers and related data dynamically into an Excel file.",
	    fn=gradio_interface,
	    inputs=_pdf_input,
	    outputs=_excel_output,
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    interface.launch()