Spaces:
Sleeping
Sleeping
add gitignore and fix excel upload bug
Browse files- .gitignore +4 -0
- backend/file_uploads.py +0 -5
- backend/parser.py +26 -18
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
parser/
|
2 |
+
backend/__pycache__/
|
3 |
+
temp/
|
4 |
+
|
backend/file_uploads.py
CHANGED
@@ -41,11 +41,6 @@ def upload_file():
|
|
41 |
parsed_file = {
|
42 |
'filename' : filename
|
43 |
}
|
44 |
-
|
45 |
-
# res = convert_to_target_json(parsed_file)
|
46 |
-
|
47 |
-
# with open("output.json", "w") as f:
|
48 |
-
# json.dump(res, f, indent=2)
|
49 |
|
50 |
return jsonify({
|
51 |
'message': 'File uploaded successfully',
|
|
|
41 |
parsed_file = {
|
42 |
'filename' : filename
|
43 |
}
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
return jsonify({
|
46 |
'message': 'File uploaded successfully',
|
backend/parser.py
CHANGED
@@ -5,6 +5,7 @@ import json
|
|
5 |
import pandas as pd
|
6 |
from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
|
7 |
from collections import Counter
|
|
|
8 |
from difflib import SequenceMatcher
|
9 |
|
10 |
def is_similar_header(h1, h2, threshold=0.8):
|
@@ -210,44 +211,51 @@ def parse_promotion_pdf(pdf_path):
|
|
210 |
return result
|
211 |
|
212 |
def parse_promotion_excel(excel_path, filename):
|
213 |
-
|
214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
-
# Bersihkan data
|
217 |
df.dropna(axis=1, how='all', inplace=True)
|
218 |
df.dropna(axis=0, how='all', inplace=True)
|
219 |
-
|
220 |
-
# Pastikan semua kolom string dan beri nama untuk kolom tak bernama
|
221 |
df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
|
222 |
-
|
223 |
-
# Ganti NaN dengan None
|
224 |
df = df.where(pd.notnull(df), None)
|
225 |
|
226 |
-
# Konversi ke list of dict
|
227 |
data = df.to_dict(orient="records")
|
228 |
-
|
229 |
-
# Buat folder temp jika belum ada
|
230 |
os.makedirs('temp', exist_ok=True)
|
231 |
-
|
232 |
-
# Tambah .json jika belum ada
|
233 |
if not filename.lower().endswith('.json'):
|
234 |
filename += '.json'
|
235 |
-
|
236 |
-
# Cegah overwrite file
|
237 |
filepath = os.path.join('temp', filename)
|
238 |
base_name, ext = os.path.splitext(filename)
|
239 |
copy_num = 1
|
240 |
while os.path.exists(filepath):
|
241 |
filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
|
242 |
copy_num += 1
|
243 |
-
|
244 |
-
# Simpan file JSON
|
245 |
with open(filepath, "w", encoding="utf-8") as f:
|
246 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
247 |
|
248 |
delete_temp_folder()
|
249 |
-
|
250 |
-
# Return data
|
251 |
return data
|
252 |
|
253 |
def convert_to_target_json(parsed_data):
|
|
|
5 |
import pandas as pd
|
6 |
from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
|
7 |
from collections import Counter
|
8 |
+
from openpyxl import load_workbook
|
9 |
from difflib import SequenceMatcher
|
10 |
|
11 |
def is_similar_header(h1, h2, threshold=0.8):
|
|
|
211 |
return result
|
212 |
|
213 |
def parse_promotion_excel(excel_path, filename):
|
214 |
+
wb = load_workbook(excel_path)
|
215 |
+
ws = wb.active
|
216 |
+
|
217 |
+
start_row = None
|
218 |
+
start_col = None
|
219 |
+
for i, row in enumerate(ws.iter_rows(min_row=1, max_row=20), start=1): # Cek 20 baris pertama
|
220 |
+
for j, cell in enumerate(row, start=1):
|
221 |
+
if cell.value not in [None, ''] and isinstance(cell.value, str):
|
222 |
+
if start_row is None or i < start_row:
|
223 |
+
start_row = i
|
224 |
+
if start_col is None or j < start_col:
|
225 |
+
start_col = j
|
226 |
+
if start_row is not None:
|
227 |
+
break
|
228 |
+
|
229 |
+
if start_row != 1 or start_col != 1:
|
230 |
+
new_ws = wb.create_sheet(title="Normalized")
|
231 |
+
for i, row in enumerate(ws.iter_rows(min_row=start_row, values_only=True), start=1):
|
232 |
+
for j, val in enumerate(row[start_col - 1:], start=1):
|
233 |
+
new_ws.cell(row=i, column=j, value=val)
|
234 |
+
wb.remove(ws)
|
235 |
+
ws = new_ws
|
236 |
+
wb.save(excel_path)
|
237 |
+
|
238 |
+
df = pd.read_excel(excel_path, engine='openpyxl', header=0)
|
239 |
|
|
|
240 |
df.dropna(axis=1, how='all', inplace=True)
|
241 |
df.dropna(axis=0, how='all', inplace=True)
|
|
|
|
|
242 |
df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
|
|
|
|
|
243 |
df = df.where(pd.notnull(df), None)
|
244 |
|
|
|
245 |
data = df.to_dict(orient="records")
|
|
|
|
|
246 |
os.makedirs('temp', exist_ok=True)
|
|
|
|
|
247 |
if not filename.lower().endswith('.json'):
|
248 |
filename += '.json'
|
|
|
|
|
249 |
filepath = os.path.join('temp', filename)
|
250 |
base_name, ext = os.path.splitext(filename)
|
251 |
copy_num = 1
|
252 |
while os.path.exists(filepath):
|
253 |
filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
|
254 |
copy_num += 1
|
|
|
|
|
255 |
with open(filepath, "w", encoding="utf-8") as f:
|
256 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
257 |
|
258 |
delete_temp_folder()
|
|
|
|
|
259 |
return data
|
260 |
|
261 |
def convert_to_target_json(parsed_data):
|