DSatishchandra committed
Commit
cf677a1
1 Parent(s): 83a4f63

Update federal_electric.py

Files changed (1)
  federal_electric.py  +71 -110
federal_electric.py CHANGED
@@ -1,122 +1,83 @@
  import pdfplumber
- import re
  import pandas as pd
- import gradio as gr
-
- def extract_po_data(pdf_file):
      """
-     Extracts Purchase Order data with enhanced multi-line Material Description handling,
-     and cleans unwanted text or symbols.
      """
      data = []
-     purchase_order_no = None
-     purchase_order_date = None
-
-     with pdfplumber.open(pdf_file) as pdf:
-         for page in pdf.pages:
-             # Extract text from page
-             lines = page.extract_text().split("\n")
-             temp_row = None  # Temporary row to handle multi-line descriptions
-
-             # Extract Purchase Order Number and Date (Assume it's on the first page)
-             if purchase_order_no is None:  # Only extract once
-                 po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
-                 po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))
-
-                 if po_no_match:
-                     purchase_order_no = po_no_match.group(1)
-                 if po_date_match:
-                     purchase_order_date = po_date_match.group(1)
-
-             # Process each line to extract data
-             for line in lines:
-                 # Regex pattern for rows (excluding multi-line descriptions)
-                 pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
-                 match = re.match(pattern, line)
-
-                 if match:
-                     # If there's a match, capture the full row
-                     if temp_row:  # Append the previous temp_row if it exists
-                         data.append(temp_row)
-                         temp_row = None
-                     temp_row = {
-                         "S. No": match[1],
-                         "Material No": match[2],
-                         "Material Description": match[3].strip(),
-                         "Qty": int(match[4]),
-                         "Unit": match[5],
-                         "Price": float(match[6]),
-                         "Delivery Date": match[7],
-                         "Total Value": float(match[8]),
-                         "Vat%": float(match[9]),
-                         "Amount Incl. VAT": float(match[10]),
-                     }
-                 elif temp_row:
-                     # If no match, treat it as a continuation of Material Description
-                     temp_row["Material Description"] += f" {line.strip()}"
-
-             # Append the last row
-             if temp_row:
-                 data.append(temp_row)
-
-     # Create DataFrame
-     df = pd.DataFrame(data)
-
-     # Insert Purchase Order No and Purchase Order Date at the beginning
-     if purchase_order_no and purchase_order_date:
-         df.insert(0, "Purchase Order No", purchase_order_no)
-         df.insert(1, "Purchase Order Date", purchase_order_date)
-
-     # Filter unwanted text from Material Description
-     def clean_description(description):
-         # Define unwanted patterns
-         unwanted_patterns = [
-             r"This document is electronically approved",  # Matches exact phrase
-             r"does not require any signature or stamp",  # Matches approval notes
-             r"Total Amount Excl\. VAT.*",  # Matches totals
-             r"TWO THOUSAND.*ONLY",  # Matches written totals
-             r"&",  # Removes stray symbols like `&`
-             r"\.+$",  # Removes trailing periods
-         ]
-         for pattern in unwanted_patterns:
-             description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
-         return description

-     df["Material Description"] = df["Material Description"].apply(clean_description)
-
-     # Strip extra spaces
-     df["Material Description"] = df["Material Description"].str.strip()

-     return df

- def process_and_save(pdf_file, output_format):
-     """
-     Processes the uploaded PDF and saves the extracted data as an Excel or CSV file.
-     """
-     df = extract_po_data(pdf_file.name)
-
-     # Save the file in the desired format
-     output_file = f"output.{output_format}"
-     if output_format == "csv":
-         df.to_csv(output_file, index=False)
-     elif output_format == "xlsx":
-         df.to_excel(output_file, index=False, engine="openpyxl")
-
-     return output_file

- # Gradio interface function
- def gradio_interface(pdf_file, output_format):
-     output_file = process_and_save(pdf_file, output_format)
-     return output_file

- # Gradio app interface
- iface = gr.Interface(
-     fn=gradio_interface,
-     inputs=[gr.File(label="Upload PDF"), gr.Radio(["csv", "xlsx"], label="Output Format")],
-     outputs=gr.File(label="Download Output"),
-     title="Enhanced PO Data Extractor",
-     description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel."
- )

- if __name__ == "__main__":
-     iface.launch()
  import pdfplumber
  import pandas as pd
+ import re
+
+ # Function: Extract Text from PDF
+ def extract_text_from_pdf(pdf_file):
+     with pdfplumber.open(pdf_file.name) as pdf:
+         text = ""
+         for page in pdf.pages:
+             text += page.extract_text()
+     return text
+
+ # Function: Parse PO Items
+ def parse_po_items_with_filters(text):
      """
+     Parses purchase order items from the extracted text using regex with filters.
+     Handles split descriptions across lines and filters unwanted text.
      """
+     lines = text.splitlines()
      data = []
+     current_item = {}
+     description_accumulator = []
+
+     for line in lines:
+         # Match the start of an item row
+         item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
+         if item_match:
+             # Save the previous item and start a new one
+             if current_item:
+                 current_item["Description"] = " ".join(description_accumulator).strip()
+                 data.append(current_item)
+                 description_accumulator = []
+
+             current_item = {
+                 "Item": item_match.group("Item"),
+                 "Description": "",
+                 "Qty": "",
+                 "Unit": "",
+                 "Unit Price": "",
+                 "Total Price": "",
+             }
+             description_accumulator.append(item_match.group("Description"))
+         elif current_item:
+             # Handle additional description lines or split descriptions
+             description_accumulator.append(line.strip())
+
+         # Match Qty, Unit, Unit Price, and Total Price
+         qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
+         if qty_match:
+             current_item["Qty"] = qty_match.group("Qty")
+             current_item["Unit"] = qty_match.group(2)
+
+         price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
+         if price_match:
+             current_item["Unit Price"] = price_match.group("UnitPrice")
+             current_item["Total Price"] = price_match.group("TotalPrice")
+
+     # Save the last item
+     if current_item:
+         current_item["Description"] = " ".join(description_accumulator).strip()
+         data.append(current_item)
+
+     if not data:
+         return None, "No items found. Please check the PDF file format."
+     df = pd.DataFrame(data)
+     return df, "Data extracted successfully."
+
+ # Function: Save to Excel
+ def save_to_excel(df, output_path="federal_electric_extracted_data.xlsx"):
+     df.to_excel(output_path, index=False)
+     return output_path
+
+ # Main function to process PDF
+ def process_pdf(file):
+     try:
+         text = extract_text_from_pdf(file)
+         df, status = parse_po_items_with_filters(text)
+         if df is not None:
+             output_path = save_to_excel(df)
+             return output_path, status
+         return None, status
+     except Exception as e:
+         return None, f"Error during processing: {str(e)}"
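For local testing, a minimal sketch (not part of the committed file) of how the new entry point could be exercised: process_pdf() expects an object exposing a .name attribute, as a Gradio file upload would provide, because extract_text_from_pdf() opens pdf_file.name. The sample path below is hypothetical.

# Hypothetical local driver, not part of this commit.
from types import SimpleNamespace

if __name__ == "__main__":
    upload = SimpleNamespace(name="sample_po.pdf")  # assumed path to a sample purchase order PDF
    output_path, status = process_pdf(upload)
    print(status)
    if output_path:
        print(f"Extracted items written to {output_path}")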