Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Update parse_bhel.py
Browse files- parse_bhel.py +39 -53
parse_bhel.py
CHANGED
@@ -1,58 +1,44 @@
|
|
1 |
-
import
|
2 |
-
import pandas as pd
|
3 |
import pdfplumber
|
|
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
parts = re.split(r'\s{2,}', line.strip()) # Split by two or more spaces
|
22 |
-
return parts if len(parts) == len(columns) else None
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
for line in text:
|
30 |
-
# Check for Purchase Order row
|
31 |
-
if po_pattern.match(line):
|
32 |
-
po_data = line.split(' / ')
|
33 |
-
po_no = po_data[0]
|
34 |
-
po_date = po_data[1]
|
35 |
-
|
36 |
-
# Check if the line contains material data
|
37 |
-
elif material_pattern.match(line):
|
38 |
-
cleaned_data = clean_and_split_line(line)
|
39 |
-
if cleaned_data:
|
40 |
-
row_data = {
|
41 |
-
"Purchase Order No": po_no,
|
42 |
-
"Date": po_date,
|
43 |
-
"Sl No": cleaned_data[0],
|
44 |
-
"Material Description": cleaned_data[1],
|
45 |
-
"Unit": cleaned_data[2],
|
46 |
-
"Quantity": cleaned_data[3],
|
47 |
-
"Dely Qty": cleaned_data[4],
|
48 |
-
"Dely Date": cleaned_data[5],
|
49 |
-
"Unit Rate": cleaned_data[6],
|
50 |
-
"Value": cleaned_data[7],
|
51 |
-
}
|
52 |
-
data = data.append(row_data, ignore_index=True)
|
53 |
-
# Skip irrelevant lines or unalignable rows
|
54 |
-
else:
|
55 |
-
continue
|
56 |
|
57 |
-
#
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
|
|
2 |
import pdfplumber
|
3 |
+
import pandas as pd
|
4 |
|
5 |
+
def parse_bhel_pdf(pdf_file):
    """Extract tabular rows from a PDF into a pandas DataFrame.

    Every page's extracted text is split into lines, and each line is split
    on whitespace. A line with at least eight tokens is treated as a row:
    the first token becomes ``Sl No``, the last six tokens become ``Unit``,
    ``Quantity``, ``Dely Qty``, ``Dely Date``, ``Unit Rate`` and ``Value``,
    and everything in between is joined with single spaces into
    ``Material Description``.

    NOTE(review): no further validation is performed, so any header or
    free-text line with eight or more tokens is also emitted as a row.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the caller in this
            file passes a file path).

    Returns:
        A DataFrame with the eight columns listed above, in that order.
        The columns are present even when no line qualified, so callers
        always see the same schema.
    """
    columns = [
        'Sl No',
        'Material Description',
        'Unit',
        'Quantity',
        'Dely Qty',
        'Dely Date',
        'Unit Rate',
        'Value',
    ]
    # One leading token + at least one description token + six trailing fields.
    min_tokens = 8

    data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page (None or empty string).
                continue
            for line in text.split('\n'):
                parts = line.split()
                if len(parts) < min_tokens:
                    continue
                # The description may contain spaces, so it is whatever sits
                # between the first token and the six trailing fields.
                values = [parts[0], " ".join(parts[1:-6]), *parts[-6:]]
                data.append(dict(zip(columns, values)))

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)
|
|
|
|
|
31 |
|
32 |
+
def gradio_interface(pdf_file):
    """Gradio callback: parse the uploaded PDF and render the rows as HTML.

    Args:
        pdf_file: The value delivered by the file-upload component. This may
            be ``None`` (nothing uploaded), a path string, or an object
            exposing the path through a ``.name`` attribute.

    Returns:
        An HTML string: the parsed table produced by ``DataFrame.to_html()``,
        or a short message when no file was provided.
    """
    if pdf_file is None:
        # Submitting without a file would otherwise fail on `.name` below.
        return "<p>No PDF file was uploaded.</p>"

    # NOTE(review): depending on the Gradio version / `type` setting the
    # upload may arrive as a plain path string rather than a file-like
    # wrapper, so accept both instead of assuming `.name` exists.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    df = parse_bhel_pdf(pdf_path)
    return df.to_html()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
# Gradio interface: built and launched as soon as this module is executed.
# NOTE(review): `type="file"` is presumably only accepted by older Gradio
# releases (newer ones expect "filepath" or "binary") — confirm against the
# Gradio version this Space pins.
_pdf_upload = gr.File(type="file", label="Upload PDF File")

_app = gr.Interface(
    fn=gradio_interface,
    inputs=_pdf_upload,
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format.",
)

_app.launch()
|