dschandra committed on
Commit
21b7e40
·
verified ·
1 Parent(s): 5224ad4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -55
app.py CHANGED
@@ -36,39 +36,10 @@ def clean_description(description, item_number=None):
36
 
37
  return description.strip()
38
 
39
- def format_description(description):
40
- """
41
- Formats the description into multiple lines based on predefined patterns.
42
- Args:
43
- description (str): Raw description string.
44
- Returns:
45
- str: Formatted description with line breaks.
46
- """
47
- # Extract parts of the description based on the expected structure
48
- line1_match = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
49
- line2_match = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
50
- line3_match = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
51
- line4_match = re.search(r"With Serial No:.*", description)
52
-
53
- # Construct the formatted description
54
- formatted_description = []
55
- if line1_match:
56
- formatted_description.append(line1_match.group())
57
- if line2_match:
58
- formatted_description.append(line2_match.group().strip())
59
- if line3_match:
60
- formatted_description.append(line3_match.group().strip())
61
- if line4_match:
62
- formatted_description.append(line4_match.group().strip())
63
-
64
- # Join the lines with a newline character
65
- return "\n".join(formatted_description)
66
-
67
-
68
  def parse_po_items_with_filters(text):
69
  """
70
  Parses purchase order items from the extracted text using regex with filters.
71
- Ensures items are not merged and handles split descriptions across lines.
72
  Args:
73
  text (str): Extracted text from the PDF.
74
  Returns:
@@ -80,16 +51,14 @@ def parse_po_items_with_filters(text):
80
  description_accumulator = []
81
 
82
  for line in lines:
83
- # Match the start of an item row (strict boundary for items)
84
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
85
  if item_match:
86
  # Save the previous item
87
  if current_item:
88
- current_item["Description"] = clean_description(
89
- " ".join(description_accumulator).strip(),
90
- item_number=int(current_item["Item"]),
91
  )
92
- current_item["Description"] = format_description(current_item["Description"])
93
  data.append(current_item)
94
  description_accumulator = []
95
 
@@ -107,7 +76,7 @@ def parse_po_items_with_filters(text):
107
  # Accumulate additional lines for the current item's description
108
  description_accumulator.append(line.strip())
109
 
110
- # Match Qty, Unit, Unit Price, and Total Price
111
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
112
  if qty_match:
113
  current_item["Qty"] = qty_match.group("Qty")
@@ -120,28 +89,12 @@ def parse_po_items_with_filters(text):
120
 
121
  # Save the last item
122
  if current_item:
123
- current_item["Description"] = clean_description(
124
- " ".join(description_accumulator).strip(),
125
- item_number=int(current_item["Item"]),
126
  )
127
- current_item["Description"] = format_description(current_item["Description"])
128
  data.append(current_item)
129
 
130
- # Clean specific patterns from item 7
131
- for item in data:
132
- if item["Item"] == "7":
133
- # Remove unwanted text from description
134
- item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
135
- # Extract and assign unit price and total price if not already extracted
136
- if not item["Unit Price"] and not item["Total Price"]:
137
- price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
138
- if price_match:
139
- item["Unit Price"] = price_match.group("UnitPrice")
140
- item["Total Price"] = price_match.group("TotalPrice")
141
- # Remove extracted price from description
142
- item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
143
-
144
- # Remove empty descriptions or invalid rows
145
  data = [row for row in data if row["Description"]]
146
 
147
  # Return data as a DataFrame
@@ -151,6 +104,33 @@ def parse_po_items_with_filters(text):
151
  return df, "Data extracted successfully."
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  # Function: Save to Excel
156
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
 
36
 
37
  return description.strip()
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def parse_po_items_with_filters(text):
40
  """
41
  Parses purchase order items from the extracted text using regex with filters.
42
+ Ensures items are formatted correctly into rows and columns.
43
  Args:
44
  text (str): Extracted text from the PDF.
45
  Returns:
 
51
  description_accumulator = []
52
 
53
  for line in lines:
54
+ # Match the start of a new item row (e.g., Item No. followed by description)
55
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
56
  if item_match:
57
  # Save the previous item
58
  if current_item:
59
+ current_item["Description"] = format_description(
60
+ " ".join(description_accumulator).strip()
 
61
  )
 
62
  data.append(current_item)
63
  description_accumulator = []
64
 
 
76
  # Accumulate additional lines for the current item's description
77
  description_accumulator.append(line.strip())
78
 
79
+ # Match Quantity, Unit, Unit Price, and Total Price
80
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
81
  if qty_match:
82
  current_item["Qty"] = qty_match.group("Qty")
 
89
 
90
  # Save the last item
91
  if current_item:
92
+ current_item["Description"] = format_description(
93
+ " ".join(description_accumulator).strip()
 
94
  )
 
95
  data.append(current_item)
96
 
97
+ # Remove empty rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  data = [row for row in data if row["Description"]]
99
 
100
  # Return data as a DataFrame
 
104
  return df, "Data extracted successfully."
105
 
106
 
107
+ def format_description(description):
108
+ """
109
+ Formats the description into multiple lines based on patterns.
110
+ Args:
111
+ description (str): Raw description text.
112
+ Returns:
113
+ str: Formatted description.
114
+ """
115
+ # Break the description into multiple lines
116
+ line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
117
+ line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
118
+ line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
119
+ line4 = re.search(r"With Serial No:.*", description)
120
+
121
+ # Build the formatted description
122
+ lines = []
123
+ if line1:
124
+ lines.append(line1.group().strip())
125
+ if line2:
126
+ lines.append(line2.group().strip())
127
+ if line3:
128
+ lines.append(line3.group().strip())
129
+ if line4:
130
+ lines.append(line4.group().strip())
131
+
132
+ return "\n".join(lines)
133
+
134
 
135
  # Function: Save to Excel
136
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):