DSatishchandra committed on
Commit
71106bd
1 Parent(s): 8a19d95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -25
app.py CHANGED
@@ -6,41 +6,54 @@ import re
6
  # Define function to extract data
7
  def extract_data(pdf_file):
8
  data = []
9
- columns = ["Purchase Order No", "Date", "SI No", "Material Number", "Material Description", "HSN Code", "IGST", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
10
 
11
- # Example Purchase Order Details (Adjust accordingly)
12
- purchase_order_no = "7200018552"
13
- purchase_order_date = "28.09.2024"
14
 
15
  with pdfplumber.open(pdf_file) as pdf:
16
  for page in pdf.pages:
17
  text = page.extract_text().splitlines()
18
- for i, line in enumerate(text):
19
- parts = line.split()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
- si_no = int(parts[0]) # Extract SI No
22
- if si_no % 10 == 0: # Assuming SI numbers are in multiples of 10
23
- # Extracting fields based on pattern and order as per the provided format
24
- material_desc = "BPS 017507" # Based on your example; adjust if dynamic
25
- material_number = parts[3] if "Material" in parts else "220736540000" # Default if not found
26
- hsn_code = "8310" # Fixed HSN Code
27
- igst = "18%" # Fixed IGST
28
- unit = parts[4]
29
- quantity = int(parts[5])
30
- dely_qty = int(parts[6])
31
- dely_date = parts[7]
32
- unit_rate = float(parts[8])
33
- value = float(parts[9])
34
 
35
- # Append extracted data in specified order
36
  data.append([
37
  purchase_order_no,
38
  purchase_order_date,
39
  si_no,
40
- material_number,
41
  material_desc,
42
- hsn_code,
43
- igst,
44
  unit,
45
  quantity,
46
  dely_qty,
@@ -49,9 +62,9 @@ def extract_data(pdf_file):
49
  value
50
  ])
51
  except (ValueError, IndexError):
52
- continue # Skip lines that don't match the format
53
 
54
- # Convert to DataFrame with specified columns
55
  df = pd.DataFrame(data, columns=columns)
56
  excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
57
  df.to_excel(excel_path, index=False)
 
6
  # Define function to extract data
7
  def extract_data(pdf_file):
8
  data = []
9
+ columns = ["Purchase Order No", "Date", "SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
10
 
11
+ purchase_order_no = None
12
+ purchase_order_date = None
 
13
 
14
  with pdfplumber.open(pdf_file) as pdf:
15
  for page in pdf.pages:
16
  text = page.extract_text().splitlines()
17
+
18
+ # Extract Purchase Order No and Date dynamically from the first page
19
+ if not purchase_order_no or not purchase_order_date:
20
+ for line in text:
21
+ # Search for Purchase Order No
22
+ po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line)
23
+ if po_match:
24
+ purchase_order_no = po_match.group(1)
25
+
26
+ # Search for Date
27
+ date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line)
28
+ if date_match:
29
+ purchase_order_date = date_match.group(1)
30
+
31
+ # Stop searching if both fields are found
32
+ if purchase_order_no and purchase_order_date:
33
+ break
34
+
35
+ # Process each line to extract relevant data rows
36
+ for line in text:
37
+ # Using regex or keywords to identify each row
38
  try:
39
+ # Example row pattern match for SI No (Assuming starts with numbers in multiples of 10)
40
+ if re.match(r'^\d+\s', line):
41
+ parts = line.split()
42
+ si_no = parts[0] # Extract SI No
43
+ material_desc = "BPS 017507\nMaterial Number: {}\nHSN Code: 8310\nIGST: 18%".format(parts[2]) # Example Material Description
44
+ unit = "NO"
45
+ quantity = parts[3]
46
+ dely_qty = parts[4]
47
+ dely_date = parts[5]
48
+ unit_rate = parts[6]
49
+ value = parts[7]
 
 
50
 
51
+ # Append data as a row in the correct order
52
  data.append([
53
  purchase_order_no,
54
  purchase_order_date,
55
  si_no,
 
56
  material_desc,
 
 
57
  unit,
58
  quantity,
59
  dely_qty,
 
62
  value
63
  ])
64
  except (ValueError, IndexError):
65
+ continue # Skip lines that do not match the expected pattern
66
 
67
+ # Convert data to a DataFrame and save it as Excel
68
  df = pd.DataFrame(data, columns=columns)
69
  excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
70
  df.to_excel(excel_path, index=False)