Spaces:
Sleeping
Sleeping
import pdfplumber | |
import pandas as pd | |
import re | |
import gradio as gr | |
def extract_data(pdf_file): | |
""" | |
Extract data from the uploaded PDF for dynamic ranges (e.g., 10 to n). | |
""" | |
data = [] | |
columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"] | |
start_si = 10 # Start from SI No 10 | |
end_si = None # Dynamically detect the end SI No | |
with pdfplumber.open(pdf_file) as pdf: | |
for page in pdf.pages: | |
full_text = page.extract_text() # Get the text content for the page | |
lines = full_text.splitlines() if full_text else [] | |
for line in lines: | |
try: | |
# Parse the first column for SI No | |
si_no_match = re.match(r"^\s*(\d+)\s", line) | |
if not si_no_match: | |
continue | |
si_no = int(si_no_match.group(1)) | |
# Dynamically set the end SI No if higher SI Nos are found | |
if end_si is None or si_no > end_si: | |
end_si = si_no | |
if si_no < start_si: | |
continue # Skip rows below the start SI No | |
# Extract Material Description and details dynamically | |
material_desc = extract_material_description(full_text, si_no) | |
# Extract remaining fields | |
parts = line.split() | |
unit = parts[3] | |
quantity = int(parts[4]) | |
dely_qty = int(parts[5]) | |
dely_date = parts[6] | |
unit_rate = float(parts[7]) | |
value = float(parts[8]) | |
# Append row data | |
data.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value]) | |
except (ValueError, IndexError): | |
# Skip invalid rows or rows with missing data | |
continue | |
# Convert data to DataFrame and save as Excel | |
df = pd.DataFrame(data, columns=columns) | |
excel_path = "/tmp/Extracted_PO_Data_Dynamic.xlsx" | |
df.to_excel(excel_path, index=False) | |
return excel_path | |
def extract_material_description(full_text, si_no): | |
""" | |
Extract Material Description, including Material Number, HSN Code, and IGST, using unique patterns. | |
""" | |
material_desc = "" | |
# Match the specific SI No row to extract details | |
si_no_pattern = rf"{si_no}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)" | |
match = re.search(si_no_pattern, full_text, re.DOTALL) | |
if match: | |
bps_code = match.group(1) | |
material_number = match.group(2) | |
material_desc += f"{bps_code}\nMaterial Number: {material_number}\n" | |
# Extract HSN Code | |
hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", full_text) | |
if hsn_code_match: | |
hsn_code = hsn_code_match.group(1) | |
material_desc += f"HSN Code: {hsn_code}\n" | |
else: | |
material_desc += "HSN Code: Not Found\n" | |
# Extract IGST | |
igst_match = re.search(r"IGST\s*:\s*(\d+)\s*%", full_text) | |
if igst_match: | |
igst = igst_match.group(1) | |
material_desc += f"IGST: {igst} %" | |
else: | |
material_desc += "IGST: Not Found" | |
return material_desc.strip() | |
def process_pdf(file): | |
""" | |
Process the uploaded PDF and return the extracted data. | |
""" | |
try: | |
# Extract text from the PDF | |
output_path = extract_data(file.name) | |
return output_path, "Data extraction successful!" | |
except Exception as e: | |
return None, f"Error during processing: {str(e)}" | |
# Gradio Interface | |
def gradio_interface(pdf_file): | |
""" | |
Interface function for Gradio to process the PDF and return the Excel file. | |
""" | |
return extract_data(pdf_file.name) | |
# Define Gradio interface | |
interface = gr.Interface( | |
fn=gradio_interface, | |
inputs=gr.File(label="Upload PDF"), | |
outputs=gr.File(label="Download Extracted Excel"), | |
title="Dynamic BHEL PO Data Extractor", | |
description="Upload a PDF to extract accurate Material Numbers and related data dynamically into an Excel file." | |
) | |
if __name__ == "__main__": | |
interface.launch() | |