blaxx14 committed on
Commit
ec46473
·
1 Parent(s): 6027c7f

add gitignore and fix excel upload bug

Browse files
Files changed (3) hide show
  1. .gitignore +4 -0
  2. backend/file_uploads.py +0 -5
  3. backend/parser.py +26 -18
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ parser/
2
+ backend/__pycache__/
3
+ temp/
4
+
backend/file_uploads.py CHANGED
@@ -41,11 +41,6 @@ def upload_file():
41
  parsed_file = {
42
  'filename' : filename
43
  }
44
-
45
- # res = convert_to_target_json(parsed_file)
46
-
47
- # with open("output.json", "w") as f:
48
- # json.dump(res, f, indent=2)
49
 
50
  return jsonify({
51
  'message': 'File uploaded successfully',
 
41
  parsed_file = {
42
  'filename' : filename
43
  }
 
 
 
 
 
44
 
45
  return jsonify({
46
  'message': 'File uploaded successfully',
backend/parser.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  import pandas as pd
6
  from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
7
  from collections import Counter
 
8
  from difflib import SequenceMatcher
9
 
10
  def is_similar_header(h1, h2, threshold=0.8):
@@ -210,44 +211,51 @@ def parse_promotion_pdf(pdf_path):
210
  return result
211
 
212
  def parse_promotion_excel(excel_path, filename):
213
- # Baca file Excel dari baris ke-6 (index 5)
214
- df = pd.read_excel(excel_path, engine='openpyxl', header=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
- # Bersihkan data
217
  df.dropna(axis=1, how='all', inplace=True)
218
  df.dropna(axis=0, how='all', inplace=True)
219
-
220
- # Pastikan semua kolom string dan beri nama untuk kolom tak bernama
221
  df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
222
-
223
- # Ganti NaN dengan None
224
  df = df.where(pd.notnull(df), None)
225
 
226
- # Konversi ke list of dict
227
  data = df.to_dict(orient="records")
228
-
229
- # Buat folder temp jika belum ada
230
  os.makedirs('temp', exist_ok=True)
231
-
232
- # Tambah .json jika belum ada
233
  if not filename.lower().endswith('.json'):
234
  filename += '.json'
235
-
236
- # Cegah overwrite file
237
  filepath = os.path.join('temp', filename)
238
  base_name, ext = os.path.splitext(filename)
239
  copy_num = 1
240
  while os.path.exists(filepath):
241
  filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
242
  copy_num += 1
243
-
244
- # Simpan file JSON
245
  with open(filepath, "w", encoding="utf-8") as f:
246
  json.dump(data, f, ensure_ascii=False, indent=2)
247
 
248
  delete_temp_folder()
249
-
250
- # Return data
251
  return data
252
 
253
  def convert_to_target_json(parsed_data):
 
5
  import pandas as pd
6
  from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
7
  from collections import Counter
8
+ from openpyxl import load_workbook
9
  from difflib import SequenceMatcher
10
 
11
  def is_similar_header(h1, h2, threshold=0.8):
 
211
  return result
212
 
213
  def parse_promotion_excel(excel_path, filename):
214
+ wb = load_workbook(excel_path)
215
+ ws = wb.active
216
+
217
+ start_row = None
218
+ start_col = None
219
+ for i, row in enumerate(ws.iter_rows(min_row=1, max_row=20), start=1): # Cek 20 baris pertama
220
+ for j, cell in enumerate(row, start=1):
221
+ if cell.value not in [None, ''] and isinstance(cell.value, str):
222
+ if start_row is None or i < start_row:
223
+ start_row = i
224
+ if start_col is None or j < start_col:
225
+ start_col = j
226
+ if start_row is not None:
227
+ break
228
+
229
+ if start_row != 1 or start_col != 1:
230
+ new_ws = wb.create_sheet(title="Normalized")
231
+ for i, row in enumerate(ws.iter_rows(min_row=start_row, values_only=True), start=1):
232
+ for j, val in enumerate(row[start_col - 1:], start=1):
233
+ new_ws.cell(row=i, column=j, value=val)
234
+ wb.remove(ws)
235
+ ws = new_ws
236
+ wb.save(excel_path)
237
+
238
+ df = pd.read_excel(excel_path, engine='openpyxl', header=0)
239
 
 
240
  df.dropna(axis=1, how='all', inplace=True)
241
  df.dropna(axis=0, how='all', inplace=True)
 
 
242
  df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
 
 
243
  df = df.where(pd.notnull(df), None)
244
 
 
245
  data = df.to_dict(orient="records")
 
 
246
  os.makedirs('temp', exist_ok=True)
 
 
247
  if not filename.lower().endswith('.json'):
248
  filename += '.json'
 
 
249
  filepath = os.path.join('temp', filename)
250
  base_name, ext = os.path.splitext(filename)
251
  copy_num = 1
252
  while os.path.exists(filepath):
253
  filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
254
  copy_num += 1
 
 
255
  with open(filepath, "w", encoding="utf-8") as f:
256
  json.dump(data, f, ensure_ascii=False, indent=2)
257
 
258
  delete_temp_folder()
 
 
259
  return data
260
 
261
  def convert_to_target_json(parsed_data):