neerajkalyank committed on
Commit
e20c41e
·
verified ·
1 Parent(s): 4fcec9d

Update toshiba.py

Browse files
Files changed (1) hide show
  1. toshiba.py +26 -8
toshiba.py CHANGED
@@ -8,39 +8,57 @@ def extract_toshiba_data(pdf_file):
8
  purchase_order, order_date = None, None
9
 
10
  with pdfplumber.open(pdf_file) as pdf:
11
- for page in pdf.pages:
12
- text = page.extract_text().splitlines()
 
 
 
 
 
 
 
 
 
13
 
14
  # Extract Purchase Order and Order Date if not already found
15
  if not purchase_order or not order_date:
16
- for line in text:
17
  po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line)
18
  date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line)
19
  if po_match:
20
  purchase_order = po_match.group(1)
 
21
  if date_match:
22
  order_date = date_match.group(1)
 
23
 
24
- # Extract item details using patterns
25
- for line in text:
26
- # Match each line with expected pattern for item rows
27
  item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line)
28
  if item_match:
29
  pos = int(item_match.group(1)) # Position number
30
  item_code = item_match.group(2) # Item Code
31
- item_name = item_match.group(3).strip() # Item Name/Description (if available)
32
  delivery_date = item_match.group(4) # Delivery Date
33
  quantity = float(item_match.group(5)) # Quantity
34
  basic_price = float(item_match.group(6)) # Basic Price
35
  amount = float(item_match.group(7)) # Calculated Amount
36
  sub_total = float(item_match.group(8)) # Subtotal or final price
37
 
 
38
  data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total])
 
39
 
40
- # Define DataFrame with the corrected structure
41
  df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"])
42
 
43
  # Save to Excel file
44
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
45
  df.to_excel(temp_file.name, index=False)
 
46
  return temp_file.name
 
 
 
 
 
 
8
  purchase_order, order_date = None, None
9
 
10
  with pdfplumber.open(pdf_file) as pdf:
11
+ for page_num, page in enumerate(pdf.pages):
12
+ # Extract and print the raw text of each page for debugging
13
+ text = page.extract_text()
14
+ if text:
15
+ print(f"Page {page_num + 1} Content:\n{text}\n{'-' * 40}\n")
16
+ else:
17
+ print(f"Page {page_num + 1} has no extractable text.\n{'-' * 40}\n")
18
+ continue
19
+
20
+ # Split text into lines to analyze line by line
21
+ lines = text.splitlines()
22
 
23
  # Extract Purchase Order and Order Date if not already found
24
  if not purchase_order or not order_date:
25
+ for line in lines:
26
  po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line)
27
  date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line)
28
  if po_match:
29
  purchase_order = po_match.group(1)
30
+ print(f"Found Purchase Order: {purchase_order}") # Debug
31
  if date_match:
32
  order_date = date_match.group(1)
33
+ print(f"Found Order Date: {order_date}") # Debug
34
 
35
+ # Attempt to match item details using a general regex pattern
36
+ for line in lines:
 
37
  item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line)
38
  if item_match:
39
  pos = int(item_match.group(1)) # Position number
40
  item_code = item_match.group(2) # Item Code
41
+ item_name = item_match.group(3).strip() # Item Name/Description
42
  delivery_date = item_match.group(4) # Delivery Date
43
  quantity = float(item_match.group(5)) # Quantity
44
  basic_price = float(item_match.group(6)) # Basic Price
45
  amount = float(item_match.group(7)) # Calculated Amount
46
  sub_total = float(item_match.group(8)) # Subtotal or final price
47
 
48
+ # Append the extracted row to data list
49
  data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total])
50
+ print(f"Matched Item Row: {[purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total]}") # Debug
51
 
52
+ # Define DataFrame with the expected structure
53
  df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"])
54
 
55
  # Save to Excel file
56
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
57
  df.to_excel(temp_file.name, index=False)
58
+ print(f"Data extracted to: {temp_file.name}")
59
  return temp_file.name
60
+
61
+ # Usage example with debug output
62
+ file_path = '/mnt/data/Toshiba PO.pdf' # Replace this with the actual file path
63
+ output_file = extract_toshiba_data(file_path)
64
+ print(f"Extracted data saved to: {output_file}")