Spaces:
Sleeping
Sleeping
neerajkalyank
committed on
Commit
•
d97cfeb
1
Parent(s):
938ff71
Update app.py
Browse files
app.py
CHANGED
@@ -1,80 +1,62 @@
|
|
1 |
-
import gradio as gr
|
2 |
import pdfplumber
|
3 |
import pandas as pd
|
4 |
-
import re
|
5 |
from io import BytesIO
|
6 |
-
import
|
|
|
7 |
|
8 |
def extract_data_from_pdf(pdf_file):
|
9 |
-
|
10 |
-
|
11 |
|
12 |
-
# Open the PDF file with pdfplumber
|
13 |
with pdfplumber.open(pdf_file) as pdf:
|
14 |
for page in pdf.pages:
|
15 |
-
# Extract text from each page
|
16 |
text = page.extract_text()
|
17 |
-
if text:
|
18 |
-
print(f"Extracted text from page {page.page_number}:\n{text}\n") # Debugging: Print extracted text
|
19 |
-
text_data.append(text)
|
20 |
-
|
21 |
-
# Initialize list for parsed data
|
22 |
-
data = []
|
23 |
-
|
24 |
-
# Define regular expressions for parsing rows
|
25 |
-
row_pattern = re.compile(
|
26 |
-
r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
|
27 |
-
)
|
28 |
-
|
29 |
-
# Process and structure extracted text
|
30 |
-
for text in text_data:
|
31 |
-
for line in text.split('\n'):
|
32 |
-
# Apply row pattern to each line
|
33 |
-
match = row_pattern.search(line)
|
34 |
-
if match:
|
35 |
-
row = match.groupdict()
|
36 |
-
row["description"] = row["description"].strip() # Clean description
|
37 |
-
row["quantity"] = float(row["quantity"])
|
38 |
-
row["price"] = float(row["price"])
|
39 |
-
row["discount"] = float(row["discount"])
|
40 |
-
row["amount"] = float(row["amount"])
|
41 |
-
|
42 |
-
# Append extracted row to data
|
43 |
-
data.append(row)
|
44 |
-
|
45 |
-
# Create DataFrame if data was extracted
|
46 |
-
if data:
|
47 |
-
df = pd.DataFrame(data)
|
48 |
-
df.columns = [
|
49 |
-
"Pos", "Item Code", "Description", "Unit", "Delivery Date", "Quantity", "Basic Price",
|
50 |
-
"Discount", "Currency", "Amount"
|
51 |
-
]
|
52 |
-
|
53 |
-
# Save the DataFrame to a temporary Excel file
|
54 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
55 |
-
with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
|
56 |
-
df.to_excel(writer, index=False, sheet_name="Extracted Data")
|
57 |
-
|
58 |
-
return temp_file.name
|
59 |
-
else:
|
60 |
-
# If no data was found, create a blank Excel file
|
61 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
62 |
-
with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
|
63 |
-
pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
|
64 |
-
|
65 |
-
return temp_file.name
|
66 |
|
67 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
iface = gr.Interface(
|
69 |
fn=extract_data_from_pdf,
|
70 |
inputs=gr.File(label="Upload PDF"),
|
71 |
outputs=gr.File(label="Download Excel"),
|
72 |
-
title="
|
73 |
-
description=
|
74 |
-
"Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
|
75 |
-
"The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
|
76 |
-
"No additional calculations are performed; it simply extracts the data as it appears."
|
77 |
-
),
|
78 |
)
|
79 |
|
80 |
iface.launch()
|
|
|
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
|
|
3 |
from io import BytesIO
|
4 |
+
import re
|
5 |
+
import gradio as gr
|
6 |
|
7 |
def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF into an in-memory Excel workbook.

    Every page is scanned with a regular expression that matches, in order,
    a position number, an item code of at least ten digits, a unit, an ISO
    date (YYYY-MM-DD) and three numeric fields (quantity, basic price,
    amount). The first "Purchase Order : <id>" found in the document is
    attached to every row collected from that page onwards, and each row
    also carries the "SUB TOTAL : <value>" of the page it came from (an
    empty string when the page has none). All values are kept as the
    matched strings; no numeric conversion is done.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing an .xlsx workbook
        with one sheet named "Extracted Data". The sheet is empty when no
        row matched.
    """
    # Compile the patterns once; they are identical for every page.
    row_pattern = re.compile(
        r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )
    po_pattern = re.compile(r"Purchase Order : (\w+)")
    sub_total_pattern = re.compile(r"SUB TOTAL : ([\d.]+)")

    data = []
    po_number = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # A page with no extractable text (e.g. a scanned image) has
            # nothing to match; skipping it also avoids passing None to re.
            if not text:
                continue

            # The PO number is taken from the first page that shows it.
            if po_number is None:
                po_match = po_pattern.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The sub total depends only on the page text, so look it up
            # once per page instead of once per matched row.
            sub_total_match = sub_total_pattern.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else ""

            for match in row_pattern.finditer(text):
                (
                    pos,
                    item_code,
                    unit,
                    delivery_date,
                    quantity,
                    basic_price,
                    amount,
                ) = match.groups()
                data.append({
                    "Purchase Order": po_number,
                    "Pos.": pos,
                    "Item Code": item_code,
                    "Unit": unit,
                    "Delivery Date": delivery_date,
                    "Quantity": quantity,
                    "Basic Price": basic_price,
                    "Amount": amount,
                    "SUB TOTAL": sub_total,
                })

    # Write the collected rows to an Excel workbook held in memory.
    df = pd.DataFrame(data)
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    output.seek(0)  # rewind so the consumer reads from the start

    # NOTE(review): Gradio's File output is documented as taking a file
    # path; confirm the Gradio version in use accepts a BytesIO here.
    return output
|
52 |
+
|
53 |
+
# Gradio UI: one PDF upload in, one Excel download out.
upload_widget = gr.File(label="Upload PDF")
download_widget = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=upload_widget,
    outputs=download_widget,
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)

# Launch the app.
iface.launch()
|