Update app.py
Browse files
app.py
CHANGED
|
@@ -36,39 +36,10 @@ def clean_description(description, item_number=None):
|
|
| 36 |
|
| 37 |
return description.strip()
|
| 38 |
|
| 39 |
-
def format_description(description):
    """
    Formats the description into multiple lines based on predefined patterns.

    The recognised fragments are, in output order: the plate title, the
    drawing-number reference, the size specification and the serial-number
    note. Each fragment that is found is placed on its own line.

    Args:
        description (str): Raw description string.
    Returns:
        str: Formatted description with line breaks. When none of the
            patterns is found the stripped input is returned as-is, so an
            unrecognised description is not turned into an empty string.
            Empty or None input yields "".
    """
    # Nothing to format; also avoids re.search raising TypeError on None.
    if not description:
        return ""

    # Extract parts of the description based on the expected structure
    line1_match = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
    # The drawing number may be the final token of the text, so accept
    # end-of-string as a terminator in addition to whitespace.
    line2_match = re.search(r"As per Drg\.No\..*?[A-Z0-9]+(?:\s|$)", description)
    line3_match = re.search(
        r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description
    )
    line4_match = re.search(r"With Serial No:.*", description)

    # Construct the formatted description, keeping the fixed fragment order
    formatted_description = [
        match.group().strip()
        for match in (line1_match, line2_match, line3_match, line4_match)
        if match
    ]

    # Fall back to the original text when no known fragment was recognised,
    # instead of silently discarding the whole description.
    if not formatted_description:
        return description.strip()

    # Join the lines with a newline character
    return "\n".join(formatted_description)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
def parse_po_items_with_filters(text):
|
| 69 |
"""
|
| 70 |
Parses purchase order items from the extracted text using regex with filters.
|
| 71 |
-
Ensures items are
|
| 72 |
Args:
|
| 73 |
text (str): Extracted text from the PDF.
|
| 74 |
Returns:
|
|
@@ -80,16 +51,14 @@ def parse_po_items_with_filters(text):
|
|
| 80 |
description_accumulator = []
|
| 81 |
|
| 82 |
for line in lines:
|
| 83 |
-
# Match the start of
|
| 84 |
item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
|
| 85 |
if item_match:
|
| 86 |
# Save the previous item
|
| 87 |
if current_item:
|
| 88 |
-
current_item["Description"] =
|
| 89 |
-
" ".join(description_accumulator).strip()
|
| 90 |
-
item_number=int(current_item["Item"]),
|
| 91 |
)
|
| 92 |
-
current_item["Description"] = format_description(current_item["Description"])
|
| 93 |
data.append(current_item)
|
| 94 |
description_accumulator = []
|
| 95 |
|
|
@@ -107,7 +76,7 @@ def parse_po_items_with_filters(text):
|
|
| 107 |
# Accumulate additional lines for the current item's description
|
| 108 |
description_accumulator.append(line.strip())
|
| 109 |
|
| 110 |
-
# Match
|
| 111 |
qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
|
| 112 |
if qty_match:
|
| 113 |
current_item["Qty"] = qty_match.group("Qty")
|
|
@@ -120,28 +89,12 @@ def parse_po_items_with_filters(text):
|
|
| 120 |
|
| 121 |
# Save the last item
|
| 122 |
if current_item:
|
| 123 |
-
current_item["Description"] =
|
| 124 |
-
" ".join(description_accumulator).strip()
|
| 125 |
-
item_number=int(current_item["Item"]),
|
| 126 |
)
|
| 127 |
-
current_item["Description"] = format_description(current_item["Description"])
|
| 128 |
data.append(current_item)
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
for item in data:
|
| 132 |
-
if item["Item"] == "7":
|
| 133 |
-
# Remove unwanted text from description
|
| 134 |
-
item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
|
| 135 |
-
# Extract and assign unit price and total price if not already extracted
|
| 136 |
-
if not item["Unit Price"] and not item["Total Price"]:
|
| 137 |
-
price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
|
| 138 |
-
if price_match:
|
| 139 |
-
item["Unit Price"] = price_match.group("UnitPrice")
|
| 140 |
-
item["Total Price"] = price_match.group("TotalPrice")
|
| 141 |
-
# Remove extracted price from description
|
| 142 |
-
item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
|
| 143 |
-
|
| 144 |
-
# Remove empty descriptions or invalid rows
|
| 145 |
data = [row for row in data if row["Description"]]
|
| 146 |
|
| 147 |
# Return data as a DataFrame
|
|
@@ -151,6 +104,33 @@ def parse_po_items_with_filters(text):
|
|
| 151 |
return df, "Data extracted successfully."
|
| 152 |
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Function: Save to Excel
|
| 156 |
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
|
|
|
|
| 36 |
|
| 37 |
return description.strip()
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def parse_po_items_with_filters(text):
|
| 40 |
"""
|
| 41 |
Parses purchase order items from the extracted text using regex with filters.
|
| 42 |
+
Ensures items are formatted correctly into rows and columns.
|
| 43 |
Args:
|
| 44 |
text (str): Extracted text from the PDF.
|
| 45 |
Returns:
|
|
|
|
| 51 |
description_accumulator = []
|
| 52 |
|
| 53 |
for line in lines:
|
| 54 |
+
# Match the start of a new item row (e.g., Item No. followed by description)
|
| 55 |
item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
|
| 56 |
if item_match:
|
| 57 |
# Save the previous item
|
| 58 |
if current_item:
|
| 59 |
+
current_item["Description"] = format_description(
|
| 60 |
+
" ".join(description_accumulator).strip()
|
|
|
|
| 61 |
)
|
|
|
|
| 62 |
data.append(current_item)
|
| 63 |
description_accumulator = []
|
| 64 |
|
|
|
|
| 76 |
# Accumulate additional lines for the current item's description
|
| 77 |
description_accumulator.append(line.strip())
|
| 78 |
|
| 79 |
+
# Match Quantity, Unit, Unit Price, and Total Price
|
| 80 |
qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
|
| 81 |
if qty_match:
|
| 82 |
current_item["Qty"] = qty_match.group("Qty")
|
|
|
|
| 89 |
|
| 90 |
# Save the last item
|
| 91 |
if current_item:
|
| 92 |
+
current_item["Description"] = format_description(
|
| 93 |
+
" ".join(description_accumulator).strip()
|
|
|
|
| 94 |
)
|
|
|
|
| 95 |
data.append(current_item)
|
| 96 |
|
| 97 |
+
# Remove empty rows
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
data = [row for row in data if row["Description"]]
|
| 99 |
|
| 100 |
# Return data as a DataFrame
|
|
|
|
| 104 |
return df, "Data extracted successfully."
|
| 105 |
|
| 106 |
|
| 107 |
+
def format_description(description):
    """
    Formats the description into multiple lines based on patterns.

    Each known fragment found in the text (plate title, drawing-number
    reference, size specification, serial-number note) is emitted on its
    own line, always in that order.

    Args:
        description (str): Raw description text.
    Returns:
        str: Formatted description, one recognised fragment per line. If no
            fragment is recognised, the stripped input is returned unchanged
            so the item's description is not lost. Empty or None input
            yields "".
    """
    # Guard: re.search would raise TypeError on None, and "" has nothing to format.
    if not description:
        return ""

    # Patterns are listed in the order their matches appear in the output.
    patterns = (
        r"Stainless Steel RATING AND DIAGRAM PLATE",
        # Accept end-of-string after the drawing number as well as
        # whitespace, so a trailing drawing number is still captured.
        r"As per Drg\.No\..*?[A-Z0-9]+(?:\s|$)",
        r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick",
        r"With Serial No:.*",
    )

    # Break the description into multiple lines
    found = (re.search(pattern, description) for pattern in patterns)
    lines = [match.group().strip() for match in found if match]

    # No known fragment: keep the original text rather than returning "",
    # which would discard the description entirely.
    if not lines:
        return description.strip()

    return "\n".join(lines)
|
| 133 |
+
|
| 134 |
|
| 135 |
# Function: Save to Excel
|
| 136 |
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
|