dschandra committed on
Commit
b9789b9
·
verified ·
1 Parent(s): 0331a22

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -21
app.py CHANGED
@@ -36,6 +36,35 @@ def clean_description(description, item_number=None):
36
 
37
  return description.strip()
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def parse_po_items_with_filters(text):
40
  """
41
  Parses purchase order items from the extracted text using regex with filters.
@@ -60,6 +89,7 @@ def parse_po_items_with_filters(text):
60
  " ".join(description_accumulator).strip(),
61
  item_number=int(current_item["Item"]),
62
  )
 
63
  data.append(current_item)
64
  description_accumulator = []
65
 
@@ -94,27 +124,9 @@ def parse_po_items_with_filters(text):
94
  " ".join(description_accumulator).strip(),
95
  item_number=int(current_item["Item"]),
96
  )
 
97
  data.append(current_item)
98
 
99
- # Handle item 3 split from item 2
100
- for i, row in enumerate(data):
101
- if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
102
- item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
103
- if item_3_description:
104
- data.insert(
105
- i + 1,
106
- {
107
- "Item": "3",
108
- "Description": item_3_description.group(),
109
- "Qty": "12",
110
- "Unit": "Nos.",
111
- "Unit Price": "3.80",
112
- "Total Price": "45.60",
113
- },
114
- )
115
- # Remove the extracted portion from item 2's description
116
- row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
117
-
118
  # Clean specific patterns from item 7
119
  for item in data:
120
  if item["Item"] == "7":
@@ -139,8 +151,6 @@ def parse_po_items_with_filters(text):
139
  return df, "Data extracted successfully."
140
 
141
 
142
-
143
-
144
  # Function: Save to Excel
145
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
146
  df.to_excel(output_path, index=False)
 
36
 
37
  return description.strip()
38
 
39
def format_description(description):
    """
    Formats the description into multiple lines based on predefined patterns.

    Each known fragment (title, drawing number, size, serial-number note) is
    searched for independently and, when present, placed on its own line in
    that fixed order.

    Args:
        description (str): Raw description string.
    Returns:
        str: Formatted description with line breaks. When none of the
        predefined patterns are found, the original description is returned
        (stripped) rather than an empty string, so items that do not follow
        the expected layout keep their text.
    """
    if not description:
        return ""

    # Patterns are listed in the order their matches should appear in the output.
    line_patterns = (
        r"Stainless Steel RATING AND DIAGRAM PLATE",
        r"As per Drg\.No\..*?\d+",
        r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick",
        r"With Serial No:.*",
    )

    matches = (re.search(pattern, description) for pattern in line_patterns)
    formatted_lines = [match.group() for match in matches if match]

    # Without this fallback every non-matching item would end up with an
    # empty description, because the parser calls this on all items.
    if not formatted_lines:
        return description.strip()

    return "\n".join(formatted_lines)
66
+
67
+
68
  def parse_po_items_with_filters(text):
69
  """
70
  Parses purchase order items from the extracted text using regex with filters.
 
89
  " ".join(description_accumulator).strip(),
90
  item_number=int(current_item["Item"]),
91
  )
92
+ current_item["Description"] = format_description(current_item["Description"])
93
  data.append(current_item)
94
  description_accumulator = []
95
 
 
124
  " ".join(description_accumulator).strip(),
125
  item_number=int(current_item["Item"]),
126
  )
127
+ current_item["Description"] = format_description(current_item["Description"])
128
  data.append(current_item)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  # Clean specific patterns from item 7
131
  for item in data:
132
  if item["Item"] == "7":
 
151
  return df, "Data extracted successfully."
152
 
153
 
 
 
154
  # Function: Save to Excel
155
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
156
  df.to_excel(output_path, index=False)