Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

App Files Files Community

neerajkalyank committed on Nov 12

Commit

359e981

•

1 Parent(s): 7ebbb35

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -27

app.py CHANGED Viewed

@@ -7,50 +7,60 @@ def extract_data_from_pdf(pdf_file):
     data = []
     po_number = None
-    # Open PDF file directly
     with pdfplumber.open(pdf_file.name) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
-            # Extract PO number (only once at the start)
             if po_number is None:
                 po_match = re.search(r"Purchase Order : (\w+)", text)
                 po_number = po_match.group(1) if po_match else "N/A"
-            # Regex pattern for extracting rows
             row_pattern = re.compile(
-                r"(\d+)\s+(\d{9})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)"
             )
-            # Extract each row using the pattern
             for match in row_pattern.finditer(text):
-                pos, item_code, unit, delivery_date, quantity, basic_price, discount, amount = match.groups()
-                # Extract subtotal if present
                 sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
                 sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
-                # Append data for each matched row
-                data.append({
-                    "Purchase Order": po_number,
-                    "Pos.": pos,
-                    "Item Code": item_code,
-                    "Unit": unit,
-                    "Delivery Date": delivery_date,
-                    "Quantity": quantity,
-                    "Basic Price": basic_price,
-                    "Discount": discount,
-                    "Amount": amount,
-                    "SUB TOTAL": sub_total,
-                })
-    # Convert data to DataFrame and save to Excel
     df = pd.DataFrame(data)
-    output_file = "output.xlsx"
-    df.to_excel(output_file, index=False)
-    return output_file
-# Gradio Interface
 iface = gr.Interface(
     fn=extract_data_from_pdf,
     inputs=gr.File(label="Upload PDF"),
@@ -58,4 +68,4 @@ iface = gr.Interface(
     title="PDF Data Extractor",
     description="Extract structured data from a PDF and output it as an Excel file.",
 )
-iface.launch()

     data = []
     po_number = None
     with pdfplumber.open(pdf_file.name) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
+            # Extract PO number
             if po_number is None:
                 po_match = re.search(r"Purchase Order : (\w+)", text)
                 po_number = po_match.group(1) if po_match else "N/A"
+            # Regex pattern for row data
             row_pattern = re.compile(
+                r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
             )
+            # Extract matching rows
             for match in row_pattern.finditer(text):
+                (
+                    pos,
+                    item_code,
+                    unit,
+                    delivery_date,
+                    quantity,
+                    basic_price,
+                    amount,
+                ) = match.groups()
                 sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
                 sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
+                data.append(
+                    {
+                        "Purchase Order": po_number,
+                        "Pos.": pos,
+                        "Item Code": item_code,
+                        "Unit": unit,
+                        "Delivery Date": delivery_date,
+                        "Quantity": quantity,
+                        "Basic Price": basic_price,
+                        "Amount": amount,
+                        "SUB TOTAL": sub_total,
+                    }
+                )
+    # Convert data to DataFrame
     df = pd.DataFrame(data)
+    # Print extracted data (debugging)
+    print(df)
+    # Save to Excel
+    df.to_excel("output.xlsx", index=False)
+    return "output.xlsx"
 iface = gr.Interface(
     fn=extract_data_from_pdf,
     inputs=gr.File(label="Upload PDF"),
     title="PDF Data Extractor",
     description="Extract structured data from a PDF and output it as an Excel file.",
 )
+iface.launch()