blaxx14 committed
Commit 81f6231
Parent(s): 26d0dba

add all files

Dockerfile ADDED
@@ -0,0 +1,12 @@
+ FROM python:3.10-slim
+
+ RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev
+
+ COPY requirements.txt .
+ RUN pip install -r requirements.txt
+
+ COPY . /app
+ WORKDIR /app
+
+ ENV PORT=7860
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,9 @@
+ from flask import Flask
+ from backend.file_uploads import upload_file
+
+ app = Flask(__name__)
+
+ app.add_url_rule('/upload', 'upload_file', upload_file, methods=['POST'])
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=7860, debug=True)  # bind to all interfaces on the port set in the Dockerfile (ENV PORT=7860)
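
For reference, a minimal client-side sketch of how the /upload route registered above could be exercised once the container is running; the host/port (localhost:7860) and the sample file name are assumptions for illustration, not part of the commit.

# Hypothetical smoke test for the /upload endpoint; assumes the Flask app is
# reachable on localhost:7860 and that sample.pdf exists in the working directory.
import requests

with open("sample.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:7860/upload",
        files={"file": ("sample.pdf", fh, "application/pdf")},  # field name must be 'file'
    )

print(resp.status_code)  # 200 on success, 400 for missing or disallowed files
print(resp.json())       # {'message': ..., 'filename': ..., 'path': ..., 'content': ...}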
backend/file_uploads.py ADDED
@@ -0,0 +1,57 @@
+ from flask import Flask, request, jsonify
+ from werkzeug.utils import secure_filename
+ from .parser import parse_promotion_pdf, parse_promotion_excel
+ from .text_recog import parsing_image
+ import os
+
+ app = Flask(__name__)  # module-level instance; only its config is used, the route itself is registered in app.py
+
+ app.config['UPLOAD_FOLDER'] = 'temp'
+ app.config['ALLOWED_EXTENSIONS'] = {'pdf', 'png', 'jpg', 'jpeg', 'xlsx', 'csv'}
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
+
+ def allowed_file(filename):
+     return '.' in filename and filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
+
+ def upload_file():
+     if 'file' not in request.files:
+         return jsonify({'error': 'No file part'}), 400
+
+     file = request.files['file']
+
+     if file.filename == '':
+         return jsonify({'error': 'No selected file'}), 400
+
+     if file and allowed_file(file.filename):
+         filename = secure_filename(file.filename)
+
+         if not os.path.exists(app.config['UPLOAD_FOLDER']):
+             os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+         filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+         file.save(filepath)
+
+         if filename.endswith('.pdf'):
+             parsed_file = parse_promotion_pdf(filepath)
+         elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
+             parsed_file = parsing_image(filepath, filename)
+         elif filename.endswith(('.xlsx', '.csv')):
+             parsed_file = parse_promotion_excel(filepath, filename)
+         else:
+             parsed_file = {
+                 'filename': filename
+             }
+
+         # res = convert_to_target_json(parsed_file)
+
+         # with open("output.json", "w") as f:
+         #     json.dump(res, f, indent=2)
+
+         return jsonify({
+             'message': 'File uploaded successfully',
+             'filename': filename,
+             'path': filepath,
+             'content': parsed_file
+         }), 200
+
+     return jsonify({'error': 'File type not allowed'}), 400
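
A small, hypothetical sanity check for the helper above, assuming the dependencies from requirements.txt are installed and the script is run from the repository root so that `backend` is importable; the file names are made up.

# Hypothetical checks for allowed_file(); the extension test is case-insensitive
# because of the .lower() call above.
from backend.file_uploads import allowed_file

assert allowed_file("promo.PDF")
assert allowed_file("budget.xlsx")
assert not allowed_file("notes.txt")       # .txt is not in ALLOWED_EXTENSIONS
assert not allowed_file("no_extension")    # a dot is required in the name

# Note on the dispatch above: str.endswith accepts a tuple, so
# filename.endswith(('.xlsx', '.csv')) matches either suffix, whereas the
# expression endswith('.xlsx' or '.csv') would only ever test '.xlsx'.
print("allowed_file checks passed")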
backend/file_utils.py ADDED
@@ -0,0 +1,83 @@
+ import time
+ import os
+ from pdf2docx import Converter
+ from docx import Document
+
+ def convert_pdf_to_word(pdf_path, docx_path):
+     cv = Converter(pdf_path)
+     cv.convert(docx_path)
+     cv.close()
+
+ def convert_image_to_word(text, filename):
+     copy_num = 0
+     doc = Document()
+     doc.add_heading(f"{filename}", level=1)
+     doc.add_paragraph(text)
+     doc.add_paragraph("\n" + "-"*50 + "\n")
+
+     if not os.path.exists(os.path.join('temp', filename)):
+         filepath = os.path.join('temp', filename)
+     else:
+         copy_num += 1
+         filepath = os.path.join('temp', f'{filename}({copy_num})')
+
+     doc.save(filepath)
+
+ def wait_for_file_release(file_path, timeout=5):
+     start_time = time.time()
+     while time.time() - start_time < timeout:
+         try:
+             with open(file_path, 'rb'):
+                 return True
+         except PermissionError:
+             time.sleep(0.5)
+     return False
+
+ def delete_temp_folder(temp_path="./temp"):
+     time.sleep(0.5)
+     for filename in os.listdir(temp_path):
+         file_path = os.path.join(temp_path, filename)
+         if wait_for_file_release(file_path):
+             try:
+                 os.remove(file_path)
+                 print(f"Deleted: {file_path}")
+             except Exception as e:
+                 print(f"Failed to delete {file_path}: {e}")
+         else:
+             print(f"File locked for too long: {file_path}")
+
+ def extract_tables_from_docx(docx_path):
+     doc = Document(docx_path)
+     all_tables = []
+
+     for table in doc.tables:
+         table_data = []
+         bold_map = []
+
+         for row in table.rows:
+             row_data = []
+             row_bold_flags = []
+
+             for cell in row.cells:
+                 texts = []
+                 is_bold = False
+
+                 for paragraph in cell.paragraphs:
+                     for run in paragraph.runs:
+                         texts.append(run.text.strip())
+                         if run.bold:
+                             is_bold = True
+
+                 cell_text = " ".join(texts).strip()
+                 row_data.append(cell_text)
+                 row_bold_flags.append(is_bold)
+
+             table_data.append(row_data)
+             bold_map.append(row_bold_flags)
+
+         all_tables.append({
+             "table_data": table_data,
+             "bold_map": bold_map
+         })
+
+     return all_tables
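
A hedged round-trip sketch for extract_tables_from_docx above: it builds a tiny .docx with python-docx, marks the header row bold (which is what drives bold_map), and reads it back. The file name and cell values are illustrative only.

# Hypothetical round-trip test for extract_tables_from_docx().
from docx import Document
from backend.file_utils import extract_tables_from_docx

doc = Document()
table = doc.add_table(rows=2, cols=2)
for col, title in enumerate(["SKU", "DISC %"]):
    cell = table.cell(0, col)
    cell.text = title
    cell.paragraphs[0].runs[0].bold = True  # bold header cells set bold_map to True
table.cell(1, 0).text = "ABC-001"
table.cell(1, 1).text = "5"
doc.save("sample_table.docx")

tables = extract_tables_from_docx("sample_table.docx")
print(tables[0]["table_data"])  # [['SKU', 'DISC %'], ['ABC-001', '5']]
print(tables[0]["bold_map"])    # [[True, True], [False, False]]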
backend/parser.py ADDED
@@ -0,0 +1,333 @@
+ import re
+ import fitz
+ import os
+ import json
+ import pandas as pd
+ from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
+ from collections import Counter
+ from difflib import SequenceMatcher
+
+ def is_similar_header(h1, h2, threshold=0.8):
+     if len(h1) != len(h2):
+         return False
+     ratio = sum(SequenceMatcher(None, a, b).ratio() for a, b in zip(h1, h2)) / len(h1)
+     return ratio > threshold
+
+ def is_empty_data(rows):
+     return all(all(cell.strip() == "" for cell in row) for row in rows)
+
+ def parse_table_data(raw_tables):
+     parsed_tables = []
+     last_table = None
+     pending_header_rows = []
+     pending_bold_maps = []
+
+     for table_dict in raw_tables:
+         table = table_dict["table_data"]
+         bold_map = table_dict["bold_map"]
+
+         if not table or not bold_map or len(table) != len(bold_map):
+             continue
+
+         if pending_header_rows:
+             table = pending_header_rows + table
+             bold_map = pending_bold_maps + bold_map
+             pending_header_rows = []
+             pending_bold_maps = []
+
+         title = Counter(table[0]).most_common(1)[0][0] if table[0] else "UNKNOWN"
+         bold_row_indices = [i for i, row in enumerate(bold_map) if any(row)]
+
+         if not bold_row_indices:
+             if title == "NO":
+                 headers = table[0]
+                 data_rows = table[1:]
+                 bold_indices = list(range(len(headers)))
+             else:
+                 if last_table:
+                     for row in table:
+                         row_dict = {
+                             last_table["headers"][i]: row[i] if i < len(row) else "not item"
+                             for i in range(len(last_table["headers"]))
+                         }
+                         not_item_count = sum(1 for v in row_dict.values() if v.strip() == "not item")
+                         if not_item_count <= 6:
+                             last_table["rows"].append(row_dict)
+                 continue
+         else:
+             if len(bold_row_indices) >= 3:
+                 header_row_index = bold_row_indices[2]
+             elif len(bold_row_indices) == 2:
+                 header_row_index = bold_row_indices[1]
+             else:
+                 header_row_index = bold_row_indices[0]
+
+             header_row = table[header_row_index]
+             bold_row = bold_map[header_row_index]
+             headers = [
+                 cell.strip() if isinstance(cell, str) else f"COL_{i}"
+                 for i, (cell, is_bold) in enumerate(zip(header_row, bold_row)) if is_bold
+             ]
+             bold_indices = [i for i, is_bold in enumerate(bold_row) if is_bold]
+             data_rows = table[header_row_index + 1:]
+
+         if is_empty_data(data_rows):
+             if last_table and is_similar_header(headers, last_table["headers"]):
+                 continue
+             else:
+                 continue
+
+         rows = []
+         for row in data_rows:
+             row_dict = {}
+             for i, header_index in enumerate(bold_indices):
+                 header = headers[i] if i < len(headers) else f"COL_{header_index}"
+                 value = row[header_index] if header_index < len(row) else ""
+                 row_dict[header] = value
+             rows.append(row_dict)
+
+         parsed = {
+             "title": title,
+             "headers": headers,
+             "rows": rows
+         }
+
+         parsed_tables.append(parsed)
+         last_table = parsed
+
+     return parsed_tables
+
+ def clean_checkbox_newlines(text):
+     pattern = r"([☑☐])\s*\n"
+
+     cleaned_text = re.sub(pattern, r"\1 ", text)
+     return cleaned_text
+
+ def parse_promotion_pdf(pdf_path):
+     doc = fitz.open(pdf_path)
+     text = ""
+
+     for page in doc:
+         text += page.get_text()
+     text = clean_checkbox_newlines(text)
+
+     pathname = os.path.splitext(os.path.basename(pdf_path))[0]
+
+     docx_path = pathname + ".docx"
+
+     # pdf2docx's Converter expects a file path, so pass pdf_path directly
+     convert_pdf_to_word(pdf_path, os.path.join('temp', docx_path))
+
+     tables = extract_tables_from_docx(os.path.join('temp', docx_path))
+     tables_result = parse_table_data(tables)
+     del tables_result[0]
+
+     result = {
+         "header": {},
+         "products": [],
+         "outlets": [],
+         "mechanisms": [],
+         "budget": {},
+     }
+
+     header_patterns = {
+         "file_number": r"NOMOR:\s*(.+)",
+         "product_category": r"PRODUCT CATEGORY\s*:\s*(.+)",
+         "brand": r"BRAND\s*:\s*(.+)",
+         "channel": r"CHANNEL\s*:\s*(.+)",
+         "region": r"REGION\s*:\s*(.+)",
+         "sub_region": r"SUB REGION\s*:\s*(.+)",
+         "distributor": r"DISTRIBUTOR\s*:\s*(.+)",
+         "promo_type": r"PROMO TYPE\s*:\s*(.+)",
+         "sub_promo_type": r"SUB PROMO TYPE\s*:\s*(.+)",
+         "period": r"PERIODE CP:\s*(\d{2}/\d{2}/\d{4})\s*-\s*(\d{2}/\d{2}/\d{4})",
+         "ref_doc": r"REF DOC\s*:\s*(.+)",
+         "ref_cp_no": r"REF CP NO\s*:\s*(.+)",
+         "cost_category": r"COST CATEGORY\s*((?:[☑☐][^\n]*\n)+)(?=(?:TIPE CP|$))",
+         "tipe_cp": r"TIPE CP\s*((?:[☑☐][^\n]*\n)+)(?=(?:TIPE CLAIM|$))",
+         "tipe_claim": r"TIPE CLAIM\s*((?:[☑☐][^\n]*\n)+)(?=(?:CLAIM BASED|$))",
+         "claim_based": r"CLAIM BASED\s*((?:[☑☐][^\n]*\n)+)(?=$)"
+     }
+
+     # result["text_table"] = tables_result
+
+     for field, pattern in header_patterns.items():
+         match = re.search(pattern, text)
+         if match:
+             if field == "period":
+                 result["header"]["validfrom"] = match.group(1).replace("/", "")
+                 result["header"]["validto"] = match.group(2).replace("/", "")
+             elif field in ["cost_category", "tipe_cp", "tipe_claim", "claim_based"]:
+                 section_text = match.group(1)
+                 text = text + section_text
+                 options = {}
+                 for opt_match in re.finditer(r"([☑☐])\s*([^\n☑☐]+)", section_text):
+                     is_checked = opt_match.group(1) == '☑'
+                     option_name = opt_match.group(2).strip()
+                     if option_name:
+                         options[option_name] = is_checked
+                 result["header"][field] = options
+             else:
+                 result["header"][field] = match.group(1).strip()
+
+     product_table_start = next((item["rows"] for item in tables_result if item["title"] == "DISCOUNT PROMOTION"), [])
+     strata_table_start = next((item["rows"] for item in tables_result if item["title"] == "STRATA DISCOUNT TABLE"), [])
+
+     if product_table_start and strata_table_start:
+         product_lookup = {item["UOM"]: item for item in product_table_start}
+
+         for feature in strata_table_start:
+             uom = feature['UOM']
+             product_data = product_lookup.get(uom)
+
+             if product_data:
+                 product = {
+                     "sku": feature['SKU'],
+                     "uom": uom,
+                     "price_list": product_data.get('PRICE LIST SATP'),
+                     "discount_percent": feature.get('DISC %'),
+                     "rbp_store": product_data.get('RBP STORE'),
+                     "share_dist": product_data.get('SHARE DIST %'),
+                     "rbp_net": feature.get('RBP NET INC PPN')
+                 }
+                 result["products"].append(product)
+
+     result["outlets"] = next((item["rows"] for item in tables_result if item["title"] == "NO"), [])
+
+     mechanism_match = re.search(r"MECHANISM:\s*(.+?)(?=(✔|$))", text, re.DOTALL)
+     if mechanism_match:
+         mechanisms = [m.strip() for m in mechanism_match.group(1).split("\n") if m.strip()]
+         mechanisms_clean = [re.sub(r'\'\d+\.\s*', '', m) for m in mechanisms]
+         result["mechanisms"] = mechanisms_clean
+
+     budget_match = re.search(r"TOTAL EST BUDGET PROMO\s*\|\s*([\d.,]+)", text)
+     if budget_match:
+         budget = float(budget_match.group(1).replace(".", "").replace(",", "."))
+         result["budget"]["total"] = budget
+
+     delete_temp_folder()
+
+     return result
+
+ def parse_promotion_excel(excel_path, filename):
+     # Read the Excel file starting at row 6 (index 5)
+     df = pd.read_excel(excel_path, engine='openpyxl', header=5)
+
+     # Clean up the data
+     df.dropna(axis=1, how='all', inplace=True)
+     df.dropna(axis=0, how='all', inplace=True)
+
+     # Ensure all column names are strings and name any unnamed columns
+     df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
+
+     # Replace NaN with None
+     df = df.where(pd.notnull(df), None)
+
+     # Convert to a list of dicts
+     data = df.to_dict(orient="records")
+
+     # Create the temp folder if it does not exist yet
+     os.makedirs('temp', exist_ok=True)
+
+     # Append .json if it is missing
+     if not filename.lower().endswith('.json'):
+         filename += '.json'
+
+     # Avoid overwriting an existing file
+     filepath = os.path.join('temp', filename)
+     base_name, ext = os.path.splitext(filename)
+     copy_num = 1
+     while os.path.exists(filepath):
+         filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
+         copy_num += 1
+
+     # Save the JSON file
+     with open(filepath, "w", encoding="utf-8") as f:
+         json.dump(data, f, ensure_ascii=False, indent=2)
+
+     delete_temp_folder()
+
+     # Return the data
+     return data
+
+ def convert_to_target_json(parsed_data):
+     """Convert parsed data to match the target JSON structure"""
+     target_json = {
+         "m_discountschema_id": 0,
+         "ad_org_id": 0,
+         "c_doctype_id": 1000134,
+         "name": f"{parsed_data['header'].get('brand', '')} PST DEAL KHUSUS",
+         "description": f"{parsed_data['header'].get('brand', '')} PST DEAL KHUSUS",
+         "discounttype": "B",
+         "vendor_id": 1000078,
+         "requirementtype": "MS",
+         "flatdiscounttype": "P",
+         "cumulativelevel": "L",
+         "validfrom": parsed_data['header'].get('validfrom', ''),
+         "validto": parsed_data['header'].get('validto', ''),
+         "selectiontype": "ISC",
+         "budgettype": "NB",
+         "organizationaleffectiveness": "ISO",
+         "qtyallocated": 0,
+         "issotrx": "Y",
+         "ispickup": "N",
+         "fl_isallowmultiplediscount": "N",
+         "isincludingsubordinate": "N",
+         "isbirthdaydiscount": "N",
+         "isactive": "Y",
+         "list_org": [{
+             "m_discountschema_id": 0,
+             "uns_discount_org_id": 0,
+             "ad_org_id": 0,
+             "ad_orgtrx_id": 1000006,
+             "isactive": "Y"
+         }],
+         "list_customer": [],
+         "list_break": []
+     }
+
+     for i, outlet in enumerate(parsed_data['outlets'], start=1):
+         target_json["list_customer"].append({
+             "m_discountschema_id": 0,
+             "uns_discount_customer_id": 0,
+             "m_discountschemabreak_id": 0,
+             "ad_org_id": 0,
+             "c_bpartner_id": 1000000 + i
+         })
+
+     for product in parsed_data['products']:
+         target_json["list_break"].append({
+             "m_discountschema_id": 0,
+             "m_discountschemabreak_id": 0,
+             "ad_org_id": 0,
+             "seqno": 10,
+             "targetbreak": "EP",
+             "discounttype": "PVD",
+             "breaktype": "M",
+             "calculationtype": "Q",
+             "name": f"{parsed_data['header'].get('promo_number', '')} {product['sku']}",
+             "requirementtype": "MS",
+             "productselection": "IOP",
+             "c_uom_id": 1000020,
+             "m_product_id": 1002979,
+             "budgettype": "GB",
+             "budgetcalculation": "QTY",
+             "qtyallocated": 1000,
+             "breakvalue": 0,
+             "breakdiscount": 0,
+             "isincludingsubordinate": "N",
+             "isshareddiscount": "N",
+             "isactive": "Y",
+             "list_line": [{
+                 "m_discountschemabreak_id": 0,
+                 "uns_dsbreakline_id": 0,
+                 "name": f"{parsed_data['header'].get('promo_number', '')} {product['sku']}",
+                 "breakvalue": 300,
+                 "breakvalueto": 1000,
+                 "qtyallocated": 1000,
+                 "breakdiscount": product['discount_percent'],
+                 "isactive": "Y"
+             }]
+         })
+
+     return target_json
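
A hedged driver for parse_promotion_excel above: it writes a small workbook whose header row sits on spreadsheet row 6 (index 5), matching header=5 in pd.read_excel, then parses it. File names and values are illustrative; note that the function also writes a JSON copy under temp/ and then clears that folder via delete_temp_folder().

# Hypothetical end-to-end call of parse_promotion_excel(); assumes openpyxl is
# installed and the script is run from the repository root.
import pandas as pd
from backend.parser import parse_promotion_excel

df = pd.DataFrame({"SKU": ["ABC-001", "ABC-002"], "DISC %": [5, 10]})
df.to_excel("sample_promo.xlsx", index=False, startrow=5)  # leaves rows 1-5 blank

records = parse_promotion_excel("sample_promo.xlsx", "sample_promo")
print(records)  # e.g. [{'SKU': 'ABC-001', 'DISC %': 5}, {'SKU': 'ABC-002', 'DISC %': 10}]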
backend/text_recog.py ADDED
@@ -0,0 +1,16 @@
+ import cv2
+ import pytesseract
+ from .file_utils import convert_image_to_word
+
+ def parsing_image(image, filename):
+     # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'  # only needed for a local Windows run; the Docker image installs tesseract-ocr system-wide
+     image = cv2.imread(image)
+
+     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+     _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
+
+     custom_config = r'--oem 3 --psm 6'
+     data = pytesseract.image_to_string(thresh, config=custom_config)
+     convert_image_to_word(data, filename)
+
+     return {}
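
A hedged OCR smoke test mirroring parsing_image above: it renders text onto a blank canvas, applies the same grayscale and inverted-threshold preprocessing, and runs Tesseract directly. It assumes a system tesseract binary is available (as installed by the Dockerfile); the recognized string may differ depending on the Tesseract version.

# Hypothetical standalone OCR check using the same preprocessing as parsing_image().
import cv2
import numpy as np
import pytesseract

img = np.full((120, 500, 3), 255, dtype=np.uint8)  # white canvas
cv2.putText(img, "PROMO 2024", (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 0), 3)

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

print(pytesseract.image_to_string(thresh, config=r'--oem 3 --psm 6').strip())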
requirements.txt ADDED
Binary file (2.76 kB).