Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

21b7e40

verified ·

1 Parent(s): 5224ad4

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -55

app.py CHANGED Viewed

@@ -36,39 +36,10 @@ def clean_description(description, item_number=None):
     return description.strip()
-def format_description(description):
-    """
-    Formats the description into multiple lines based on predefined patterns.
-    Args:
-        description (str): Raw description string.
-    Returns:
-        str: Formatted description with line breaks.
-    """
-    # Extract parts of the description based on the expected structure
-    line1_match = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
-    line2_match = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
-    line3_match = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
-    line4_match = re.search(r"With Serial No:.*", description)
-    # Construct the formatted description
-    formatted_description = []
-    if line1_match:
-        formatted_description.append(line1_match.group())
-    if line2_match:
-        formatted_description.append(line2_match.group().strip())
-    if line3_match:
-        formatted_description.append(line3_match.group().strip())
-    if line4_match:
-        formatted_description.append(line4_match.group().strip())
-    # Join the lines with a newline character
-    return "\n".join(formatted_description)
 def parse_po_items_with_filters(text):
     """
     Parses purchase order items from the extracted text using regex with filters.
-    Ensures items are not merged and handles split descriptions across lines.
     Args:
         text (str): Extracted text from the PDF.
     Returns:
@@ -80,16 +51,14 @@ def parse_po_items_with_filters(text):
     description_accumulator = []
     for line in lines:
-        # Match the start of an item row (strict boundary for items)
         item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
         if item_match:
             # Save the previous item
             if current_item:
-                current_item["Description"] = clean_description(
-                    " ".join(description_accumulator).strip(),
-                    item_number=int(current_item["Item"]),
                 )
-                current_item["Description"] = format_description(current_item["Description"])
                 data.append(current_item)
                 description_accumulator = []
@@ -107,7 +76,7 @@ def parse_po_items_with_filters(text):
             # Accumulate additional lines for the current item's description
             description_accumulator.append(line.strip())
-        # Match Qty, Unit, Unit Price, and Total Price
         qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
         if qty_match:
             current_item["Qty"] = qty_match.group("Qty")
@@ -120,28 +89,12 @@ def parse_po_items_with_filters(text):
     # Save the last item
     if current_item:
-        current_item["Description"] = clean_description(
-            " ".join(description_accumulator).strip(),
-            item_number=int(current_item["Item"]),
         )
-        current_item["Description"] = format_description(current_item["Description"])
         data.append(current_item)
-    # Clean specific patterns from item 7
-    for item in data:
-        if item["Item"] == "7":
-            # Remove unwanted text from description
-            item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
-            # Extract and assign unit price and total price if not already extracted
-            if not item["Unit Price"] and not item["Total Price"]:
-                price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
-                if price_match:
-                    item["Unit Price"] = price_match.group("UnitPrice")
-                    item["Total Price"] = price_match.group("TotalPrice")
-                    # Remove extracted price from description
-                    item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
-    # Remove empty descriptions or invalid rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
@@ -151,6 +104,33 @@ def parse_po_items_with_filters(text):
     return df, "Data extracted successfully."
 # Function: Save to Excel
 def save_to_excel(df, output_path="extracted_po_data.xlsx"):

     return description.strip()
 def parse_po_items_with_filters(text):
     """
     Parses purchase order items from the extracted text using regex with filters.
+    Ensures items are formatted correctly into rows and columns.
     Args:
         text (str): Extracted text from the PDF.
     Returns:
     description_accumulator = []
     for line in lines:
+        # Match the start of a new item row (e.g., Item No. followed by description)
         item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
         if item_match:
             # Save the previous item
             if current_item:
+                current_item["Description"] = format_description(
+                    " ".join(description_accumulator).strip()
                 )
                 data.append(current_item)
                 description_accumulator = []
             # Accumulate additional lines for the current item's description
             description_accumulator.append(line.strip())
+        # Match Quantity, Unit, Unit Price, and Total Price
         qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
         if qty_match:
             current_item["Qty"] = qty_match.group("Qty")
     # Save the last item
     if current_item:
+        current_item["Description"] = format_description(
+            " ".join(description_accumulator).strip()
         )
         data.append(current_item)
+    # Remove empty rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
     return df, "Data extracted successfully."
def format_description(description):
    """
    Formats the description into multiple lines based on patterns.

    Only the recognised fragments are kept, always in the same order: the
    plate title, the drawing reference, the size specification and the
    serial-number note. Text that matches none of the patterns is discarded,
    so a description without any recognised fragment yields an empty string.

    Args:
        description (str): Raw description text.
    Returns:
        str: Recognised fragments joined by newlines ("" when none match).
    """
    patterns = (
        r"Stainless Steel RATING AND DIAGRAM PLATE",
        # The drawing number may be the last token of the (already stripped)
        # description, so accept end-of-string as well as whitespace as its
        # terminator; otherwise the whole drawing line would be lost.
        r"As per Drg\.No\..*?[A-Z0-9]+(?:\s|$)",
        r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick",
        r"With Serial No:.*",
    )
    # Each pattern is searched independently against the full description.
    matches = (re.search(pattern, description) for pattern in patterns)
    return "\n".join(found.group().strip() for found in matches if found)
 # Function: Save to Excel
 def save_to_excel(df, output_path="extracted_po_data.xlsx"):