DSatishchandra committed on
Commit
ab2a518
·
verified ·
1 Parent(s): b051e96

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +39 -53
parse_bhel.py CHANGED
@@ -1,58 +1,44 @@
1
- import re
2
- import pandas as pd
3
  import pdfplumber
 
4
 
5
# Target columns of the output table. The first two are taken from the
# purchase-order header line; the remaining eight are taken from each
# material line.
columns = [
    "Purchase Order No", "Date", "Sl No", "Material Description",
    "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
]

# Columns that are parsed out of a single material line (everything after the
# two purchase-order fields).
_line_columns = columns[2:]

# Regex patterns used to recognise the lines of interest.
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}')  # Purchase Order pattern
material_pattern = re.compile(r'^\d{1,3} ')  # Pattern for lines starting with Sl No


def clean_and_split_line(line):
    """Split a material line into its fields.

    The line is stripped and split on runs of two or more whitespace
    characters.

    Args:
        line: One line of text extracted from the PDF.

    Returns:
        The list of fields when the line yields exactly one field per
        material-line column (``Sl No`` .. ``Value``), otherwise ``None``.
    """
    parts = re.split(r'\s{2,}', line.strip())  # Split by two or more spaces
    # Compare against the per-line columns only: the purchase order number and
    # date are not part of a material line, so requiring len(columns) fields
    # would reject every well-formed row.
    return parts if len(parts) == len(_line_columns) else None


# Rows are collected in a plain list and turned into a DataFrame once at the
# end; DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
rows = []

# Purchase order currently in effect. Stays None for material lines that
# appear before any purchase-order line.
po_no = None
po_date = None

# Process the PDF and extract relevant lines
with pdfplumber.open('your_pdf_file.pdf') as pdf:
    for page in pdf.pages:
        # Guard against pages for which no text is returned.
        text = (page.extract_text() or "").splitlines()

        for line in text:
            if po_pattern.match(line):
                # Purchase Order row: remember it for the material lines
                # that follow.
                po_data = line.split(' / ')
                po_no = po_data[0]
                po_date = po_data[1]
            elif material_pattern.match(line):
                # Candidate material row; rows that do not split into the
                # expected number of fields are skipped.
                cleaned_data = clean_and_split_line(line)
                if cleaned_data:
                    rows.append([po_no, po_date, *cleaned_data])
            # Any other line is irrelevant and ignored.

data = pd.DataFrame(rows, columns=columns)

# Save extracted data to an Excel file
data.to_excel("extracted_data.xlsx", index=False)
 
 
 
 
 
 
 
1
+ import gradio as gr
 
2
  import pdfplumber
3
+ import pandas as pd
4
 
5
def parse_bhel_pdf(pdf_file):
    """Extract row-like text lines from a PDF into a DataFrame.

    Each page's text is split into lines and each line into
    whitespace-separated tokens. A line with at least eight tokens becomes one
    row: the first token is ``Sl No``, the last six tokens are ``Unit``,
    ``Quantity``, ``Dely Qty``, ``Dely Date``, ``Unit Rate`` and ``Value``,
    and everything in between is joined with single spaces into
    ``Material Description``.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` with the eight columns listed above. When no
        line qualifies the frame is empty but still carries those columns.
    """
    column_names = [
        'Sl No', 'Material Description', 'Unit', 'Quantity',
        'Dely Qty', 'Dely Date', 'Unit Rate', 'Value',
    ]
    data = []

    # Open the uploaded PDF file
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            for line in text.split('\n'):
                parts = line.split()
                if len(parts) < 8:
                    continue
                # NOTE(review): any line with 8+ tokens is accepted, so
                # headings or free text of that length also become rows --
                # confirm whether a stricter row test is wanted.
                data.append({
                    'Sl No': parts[0],
                    'Material Description': " ".join(parts[1:-6]),
                    'Unit': parts[-6],
                    'Quantity': parts[-5],
                    'Dely Qty': parts[-4],
                    'Dely Date': parts[-3],
                    'Unit Rate': parts[-2],
                    'Value': parts[-1],
                })

    # Convert extracted data to a DataFrame. Passing the column names keeps
    # the schema stable even when no rows were found.
    return pd.DataFrame(data, columns=column_names)
 
 
31
 
32
def gradio_interface(pdf_file):
    """Parse an uploaded PDF and return the extracted table as HTML.

    Args:
        pdf_file: The value supplied by the upload component: ``None`` when
            nothing was uploaded, a filesystem path string, or an object
            exposing the path as its ``name`` attribute.

    Returns:
        An HTML string: the rendered table, or a short notice when no file
        was provided.
    """
    if pdf_file is None:
        # Submitting without a file would otherwise fail on ``.name``.
        return "<p>No file uploaded.</p>"

    # Accept both a plain path and a file wrapper carrying the path in .name.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    df = parse_bhel_pdf(pdf_path)
    return df.to_html()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
# Gradio interface: a single PDF upload in, the extracted table out as HTML.
# NOTE(review): type="file" appears to be accepted only by older Gradio
# releases -- confirm against the installed version.
pdf_upload = gr.File(type="file", label="Upload PDF File")

demo = gr.Interface(
    fn=gradio_interface,
    inputs=pdf_upload,
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format.",
)

# Start serving the UI.
demo.launch()