File size: 4,615 Bytes
d2251a9
 
 
4482309
229b210
d2251a9
 
 
 
71106bd
cf4d471
6e8ab5b
 
 
d2251a9
229b210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e8ab5b
 
 
 
d2251a9
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pdfplumber
import pandas as pd
import gradio as gr
import re
import tempfile

# Define function to extract data
def extract_data(pdf_file):
    data = []
    columns = ["Purchase Order No", "Date", "SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]

    # Set default values
    purchase_order_no = "Not Found"
    purchase_order_date = "Not Found"

    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                
                if not text:
                    continue  # Skip pages without text
                
                lines = text.splitlines()

                # Attempt to dynamically extract Purchase Order No and Date from the first page
                for line in lines:
                    # Search for Purchase Order No
                    po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line, re.IGNORECASE)
                    if po_match:
                        purchase_order_no = po_match.group(1)

                    # Search for Date
                    date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line, re.IGNORECASE)
                    if date_match:
                        purchase_order_date = date_match.group(1)

                    # Stop if both values are found
                    if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
                        break

                # Process lines to extract row data, looking for rows that start with SI No
                for line in lines:
                    try:
                        # Match lines that start with an SI number (e.g., "10", "20")
                        si_no_match = re.match(r'^(\d+)\s', line)
                        if si_no_match:
                            parts = line.split()

                            # Extract SI No
                            si_no = parts[0]

                            # Extract Material Number and format the Material Description
                            material_number = parts[2] if len(parts) > 2 else "Unknown"
                            material_desc = f"BPS 017507\nMaterial Number: {material_number}\nHSN Code: 8310\nIGST: 18%"

                            # Extract Unit, Quantity, Dely Qty, Dely Date, Unit Rate, and Value
                            unit = parts[3] if len(parts) > 3 else "NO"  # Default to "NO" if not found
                            quantity = int(parts[4]) if len(parts) > 4 else 0
                            dely_qty = int(parts[5]) if len(parts) > 5 else 0
                            dely_date = parts[6] if len(parts) > 6 else "Unknown"
                            unit_rate = float(parts[7]) if len(parts) > 7 else 0.0
                            value = float(parts[8]) if len(parts) > 8 else 0.0

                            # Append extracted data in the specified order
                            data.append([
                                purchase_order_no,
                                purchase_order_date,
                                si_no,
                                material_desc,
                                unit,
                                quantity,
                                dely_qty,
                                dely_date,
                                unit_rate,
                                value
                            ])
                    except (ValueError, IndexError) as e:
                        print(f"Error processing line: {line} - {e}")
                        continue  # Skip lines that do not match the expected format

        # Convert data to DataFrame and save as Excel
        df = pd.DataFrame(data, columns=columns)
        
        # Generate a temporary file path for the Excel file
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file:
            excel_path = tmp_file.name
            df.to_excel(excel_path, index=False)

    except Exception as e:
        print(f"An error occurred while processing the PDF: {e}")
        return None

    # Log warning if data was not found for Purchase Order No or Date
    if purchase_order_no == "Not Found" or purchase_order_date == "Not Found":
        print("Warning: 'Purchase Order No' or 'Date' was not found in the PDF.")
    
    return excel_path

# Set up Gradio interface
iface = gr.Interface(
    fn=extract_data,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="PDF Data Extractor"
)

# Launch the app
iface.launch()