neerajkalyank committed on
Commit
ff7685a
1 Parent(s): 7de763b

Create toshiba.py

Browse files
Files changed (1) hide show
  1. toshiba.py +87 -0
toshiba.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber
8
+
9
# Header fields are matched anywhere in a line; compiled once instead of per line.
_PO_PATTERN = re.compile(r'Purchase Order\s*:\s*(P\d+)')
_DATE_PATTERN = re.compile(r'Order Date\s*:\s*([\d-]+)')

_COLUMNS = ["Purchase Order", "Order Date", "Pos", "Item Code", "Quantity", "Basic Price", "Sub Total"]
_OUTPUT_NAME = "Extracted_Purchase_Order_Data.xlsx"


def _parse_item_line(line):
    """Return ``[pos, item_code, quantity, basic_price, sub_total]`` for an item row, else ``None``."""
    parts = line.split()
    try:
        pos = int(parts[0])
        # Only rows whose leading integer lies in 10..450 are treated as items.
        if not 10 <= pos <= 450:
            return None
        # Token layout used here: [0]=pos, [1]=item code, [4]=quantity,
        # [5]=basic price, last token=sub total.
        return [pos, parts[1], float(parts[4]), float(parts[5]), float(parts[-1])]
    except (ValueError, IndexError):
        # Blank lines, non-numeric leading tokens, short rows, or numbers
        # that float() cannot parse: not an item row.
        return None


def extract_data(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page is read as text. Until both values have been found, each page
    is scanned for ``Purchase Order : P<digits>`` and ``Order Date : <digits/->``;
    the last match on a page wins. Every text line is then tried as an item
    row (see ``_parse_item_line``); lines that do not parse are skipped.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (path or file object).

    Returns:
        Path of the written ``.xlsx`` file. Each call writes into its own
        fresh temporary directory so simultaneous requests cannot overwrite
        one another's result; the file name itself is unchanged.
    """
    rows = []
    purchase_order = order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against a page with no extractable text: some pdfplumber
            # versions are believed to return None here rather than "".
            lines = (page.extract_text() or "").splitlines()

            if not purchase_order or not order_date:
                for line in lines:
                    po_match = _PO_PATTERN.search(line)
                    if po_match:
                        purchase_order = po_match.group(1)
                    date_match = _DATE_PATTERN.search(line)
                    if date_match:
                        order_date = date_match.group(1)

            # Rows carry whatever header values are known at this point;
            # they stay None if the header has not been seen yet.
            for line in lines:
                item = _parse_item_line(line)
                if item is not None:
                    rows.append([purchase_order, order_date, *item])

    df = pd.DataFrame(rows, columns=_COLUMNS)

    # A per-call directory replaces the fixed "/tmp/..." path, which was
    # shared by all requests and does not exist on non-POSIX systems.
    out_dir = tempfile.mkdtemp(prefix="po_extract_")
    excel_path = os.path.join(out_dir, _OUTPUT_NAME)
    df.to_excel(excel_path, index=False)
    return excel_path
74
+
75
# Gradio front end: a single PDF upload goes in, the generated Excel file
# comes back as a download.
iface = gr.Interface(
    extract_data,                      # fn
    gr.File(label="Upload PDF"),       # inputs
    gr.File(label="Download Excel"),   # outputs
    title="PDF Data Extractor",
)

# Start the web server as soon as this file is executed.
iface.launch()