Spaces:
Runtime error
Runtime error
File size: 4,615 Bytes
d2251a9 4482309 6d9fb15 d2251a9 71106bd cf4d471 6e8ab5b d2251a9 6d9fb15 6e8ab5b d2251a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import pdfplumber
import pandas as pd
import gradio as gr
import re
import tempfile
# Define function to extract data
def extract_data(pdf_file):
data = []
columns = ["Purchase Order No", "Date", "SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
# Set default values
purchase_order_no = "Not Found"
purchase_order_date = "Not Found"
try:
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
text = page.extract_text()
if not text:
continue # Skip pages without text
lines = text.splitlines()
# Attempt to dynamically extract Purchase Order No and Date from the first page
for line in lines:
# Search for Purchase Order No
po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line, re.IGNORECASE)
if po_match:
purchase_order_no = po_match.group(1)
# Search for Date
date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line, re.IGNORECASE)
if date_match:
purchase_order_date = date_match.group(1)
# Stop if both values are found
if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
break
# Process lines to extract row data, looking for rows that start with SI No
for line in lines:
try:
# Match lines that start with an SI number (e.g., "10", "20")
si_no_match = re.match(r'^(\d+)\s', line)
if si_no_match:
parts = line.split()
# Extract SI No
si_no = parts[0]
# Extract Material Number and format the Material Description
material_number = parts[2] if len(parts) > 2 else "Unknown"
material_desc = f"BPS 017507\nMaterial Number: {material_number}\nHSN Code: 8310\nIGST: 18%"
# Extract Unit, Quantity, Dely Qty, Dely Date, Unit Rate, and Value
unit = parts[3] if len(parts) > 3 else "NO" # Default to "NO" if not found
quantity = int(parts[4]) if len(parts) > 4 else 0
dely_qty = int(parts[5]) if len(parts) > 5 else 0
dely_date = parts[6] if len(parts) > 6 else "Unknown"
unit_rate = float(parts[7]) if len(parts) > 7 else 0.0
value = float(parts[8]) if len(parts) > 8 else 0.0
# Append extracted data in the specified order
data.append([
purchase_order_no,
purchase_order_date,
si_no,
material_desc,
unit,
quantity,
dely_qty,
dely_date,
unit_rate,
value
])
except (ValueError, IndexError) as e:
print(f"Error processing line: {line} - {e}")
continue # Skip lines that do not match the expected format
# Convert data to DataFrame and save as Excel
df = pd.DataFrame(data, columns=columns)
# Generate a temporary file path for the Excel file
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file:
excel_path = tmp_file.name
df.to_excel(excel_path, index=False)
except Exception as e:
print(f"An error occurred while processing the PDF: {e}")
return None
# Log warning if data was not found for Purchase Order No or Date
if purchase_order_no == "Not Found" or purchase_order_date == "Not Found":
print("Warning: 'Purchase Order No' or 'Date' was not found in the PDF.")
return excel_path
# Set up Gradio interface
iface = gr.Interface(
fn=extract_data,
inputs=gr.File(label="Upload PDF"),
outputs=gr.File(label="Download Excel"),
title="PDF Data Extractor"
)
# Launch the app
iface.launch()
|