Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Commit
•
2d4ebda
1
Parent(s):
cf4d471
Update app.py
Browse files
app.py
CHANGED
@@ -5,24 +5,26 @@ import gradio as gr
|
|
5 |
# Define function to extract data
|
6 |
def extract_data(pdf_file):
|
7 |
data = []
|
8 |
-
columns = ["SI No", "Material
|
9 |
|
10 |
-
# Example Purchase Order Details (Adjust accordingly if
|
11 |
purchase_order_no = "PO12345"
|
12 |
purchase_order_date = "04.11.2024"
|
13 |
|
14 |
with pdfplumber.open(pdf_file) as pdf:
|
15 |
for page in pdf.pages:
|
16 |
text = page.extract_text().splitlines()
|
17 |
-
for line in text:
|
18 |
parts = line.split()
|
19 |
try:
|
20 |
-
si_no = int(parts[0])
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
unit = parts[4]
|
27 |
quantity = int(parts[5])
|
28 |
dely_qty = int(parts[6])
|
@@ -30,11 +32,13 @@ def extract_data(pdf_file):
|
|
30 |
unit_rate = float(parts[8])
|
31 |
value = float(parts[9])
|
32 |
|
33 |
-
# Append extracted data
|
34 |
data.append([
|
|
|
|
|
35 |
si_no,
|
36 |
-
material_desc,
|
37 |
material_number,
|
|
|
38 |
hsn_code,
|
39 |
igst,
|
40 |
unit,
|
@@ -42,13 +46,12 @@ def extract_data(pdf_file):
|
|
42 |
dely_qty,
|
43 |
dely_date,
|
44 |
unit_rate,
|
45 |
-
value
|
46 |
-
purchase_order_no,
|
47 |
-
purchase_order_date
|
48 |
])
|
49 |
except (ValueError, IndexError):
|
50 |
-
continue
|
51 |
|
|
|
52 |
df = pd.DataFrame(data, columns=columns)
|
53 |
excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
|
54 |
df.to_excel(excel_path, index=False)
|
|
|
5 |
# Define function to extract data
|
6 |
def extract_data(pdf_file):
|
7 |
data = []
|
8 |
+
columns = ["Purchase Order No", "Date", "SI No", "Material Number", "Material Description", "HSN Code", "IGST", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
|
9 |
|
10 |
+
# Example Purchase Order Details (Adjust accordingly or add dynamic extraction if possible)
|
11 |
purchase_order_no = "PO12345"
|
12 |
purchase_order_date = "04.11.2024"
|
13 |
|
14 |
with pdfplumber.open(pdf_file) as pdf:
|
15 |
for page in pdf.pages:
|
16 |
text = page.extract_text().splitlines()
|
17 |
+
for i, line in enumerate(text):
|
18 |
parts = line.split()
|
19 |
try:
|
20 |
+
si_no = int(parts[0]) # Extract SI No
|
21 |
+
# Check if the line follows the expected format for a row
|
22 |
+
if si_no % 10 == 0: # Assuming SI numbers are in multiples of 10 as per sample
|
23 |
+
# Extract each field based on position and format
|
24 |
+
material_desc = " ".join(parts[1:3]) # Adjust indexing if necessary
|
25 |
+
material_number = parts[3] if "Material" in parts else "220736540000" # Default if not found
|
26 |
+
hsn_code = "8310" # Fixed as per example; can be extracted if available
|
27 |
+
igst = "18%" # Fixed as per example; can be extracted if available
|
28 |
unit = parts[4]
|
29 |
quantity = int(parts[5])
|
30 |
dely_qty = int(parts[6])
|
|
|
32 |
unit_rate = float(parts[8])
|
33 |
value = float(parts[9])
|
34 |
|
35 |
+
# Append extracted data in specified order
|
36 |
data.append([
|
37 |
+
purchase_order_no,
|
38 |
+
purchase_order_date,
|
39 |
si_no,
|
|
|
40 |
material_number,
|
41 |
+
material_desc,
|
42 |
hsn_code,
|
43 |
igst,
|
44 |
unit,
|
|
|
46 |
dely_qty,
|
47 |
dely_date,
|
48 |
unit_rate,
|
49 |
+
value
|
|
|
|
|
50 |
])
|
51 |
except (ValueError, IndexError):
|
52 |
+
continue # Skip lines that don't match the format
|
53 |
|
54 |
+
# Convert to DataFrame with specified columns
|
55 |
df = pd.DataFrame(data, columns=columns)
|
56 |
excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
|
57 |
df.to_excel(excel_path, index=False)
|