Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

App Files Files Community

neerajkalyank committed on Nov 12

Commit

d97cfeb

•

1 Parent(s): 938ff71

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -63

app.py CHANGED Viewed

@@ -1,80 +1,62 @@
-import gradio as gr
 import pdfplumber
 import pandas as pd
-import re
 from io import BytesIO
-import tempfile
 def extract_data_from_pdf(pdf_file):
     """Extract purchase-order rows from a PDF into a temporary Excel file.
     Every text line of every page is searched with a row regex capturing
     position, item code, description, unit, delivery date, quantity, price,
     discount, currency and amount. Matching rows are written to an
     "Extracted Data" sheet; when nothing matches, a one-cell "Error" sheet
     is written instead.
     Args:
         pdf_file: The PDF to read; passed unchanged to ``pdfplumber.open``.
     Returns:
         Path (str) of the generated ``.xlsx`` file. The file is created with
         ``delete=False``, so it is not removed automatically.
     """
     row_pattern = re.compile(
         r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
     )
     numeric_fields = ("quantity", "price", "discount", "amount")
     rows = []
     with pdfplumber.open(pdf_file) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
             # Pages that yield no text have nothing to parse.
             if not text:
                 continue
             for line in text.split('\n'):
                 match = row_pattern.search(line)
                 if match is None:
                     continue
                 row = match.groupdict()
                 row["description"] = row["description"].strip()
                 for field in numeric_fields:
                     row[field] = float(row[field])
                 rows.append(row)
     if rows:
         df = pd.DataFrame(rows)
         # Columns arrive in regex group order; give them display names.
         df.columns = [
             "Pos", "Item Code", "Description", "Unit", "Delivery Date", "Quantity", "Basic Price",
             "Discount", "Currency", "Amount"
         ]
         sheet_name = "Extracted Data"
     else:
         # Nothing matched: still return a workbook, carrying an explanation.
         df = pd.DataFrame(
             [["No structured data found. Please check the PDF structure."]],
             columns=["Error"],
         )
         sheet_name = "Error"
     # Reserve a path and close the handle right away so it is not leaked;
     # ExcelWriter then reopens the file by name.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
         output_path = temp_file.name
     with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
         df.to_excel(writer, index=False, sheet_name=sheet_name)
     return output_path
-# Define Gradio Interface with updated components
 # Web UI: one PDF upload in, one Excel download out, served by the extractor.
 iface = gr.Interface(
     title="Advanced Document Data Extractor",
     description=(
         "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
         "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
         "No additional calculations are performed; it simply extracts the data as it appears."
     ),
     fn=extract_data_from_pdf,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Excel"),
 )
 # Start the app as soon as the module is executed.
 iface.launch()

 import pdfplumber
 import pandas as pd
 from io import BytesIO
+import re
+import gradio as gr
 def extract_data_from_pdf(pdf_file):
     """Extract purchase-order rows from a PDF into an in-memory Excel workbook.
     Each page's text is scanned for rows of the form
     ``pos  item_code  unit  YYYY-MM-DD  quantity  basic_price  amount``.
     The first "Purchase Order : <id>" found in the document is attached to
     every row collected from that point on, and each row carries the first
     "SUB TOTAL : <value>" found on its own page (empty string if none).
     Args:
         pdf_file: The PDF to read; passed unchanged to ``pdfplumber.open``.
     Returns:
         A ``BytesIO`` positioned at offset 0 containing an ``.xlsx`` workbook
         with a single "Extracted Data" sheet. All extracted values are kept
         as the strings captured by the regexes.
     """
     # NOTE(review): this value is handed to a gr.File output; confirm that the
     # installed Gradio version accepts a BytesIO rather than a file path.
     # Compile once per call instead of once per page.
     po_pattern = re.compile(r"Purchase Order : (\w+)")
     row_pattern = re.compile(
         r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
     )
     sub_total_pattern = re.compile(r"SUB TOTAL : ([\d.]+)")
     data = []
     po_number = None
     with pdfplumber.open(pdf_file) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
             # A page with no extractable text has no rows, and the regex
             # searches below require a str.
             if not text:
                 continue
             # Extract the PO number once (if not already extracted).
             if po_number is None:
                 po_match = po_pattern.search(text)
                 if po_match:
                     po_number = po_match.group(1)
             # The sub total depends only on the page, so look it up once
             # rather than once per matched row.
             sub_total_match = sub_total_pattern.search(text)
             sub_total = sub_total_match.group(1) if sub_total_match else ""
             for match in row_pattern.finditer(text):
                 pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
                 data.append({
                     "Purchase Order": po_number,
                     "Pos.": pos,
                     "Item Code": item_code,
                     "Unit": unit,
                     "Delivery Date": delivery_date,
                     "Quantity": quantity,
                     "Basic Price": basic_price,
                     "Amount": amount,
                     "SUB TOTAL": sub_total,
                 })
     # Convert the collected rows to a DataFrame and serialize it to memory.
     df = pd.DataFrame(data)
     output = BytesIO()
     with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
         df.to_excel(writer, index=False, sheet_name="Extracted Data")
     # Rewind so a reader starts at the beginning of the workbook.
     output.seek(0)
     return output
+# Gradio Interface
 # Web UI: one PDF upload in, one Excel download out, served by the extractor.
 iface = gr.Interface(
     title="PDF Data Extractor",
     description="Extract structured data from a PDF and output it as an Excel file.",
     fn=extract_data_from_pdf,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Excel"),
 )
 # Start the app as soon as the module is executed.
 iface.launch()