Spaces:

blaxx14
/

pdf-parser-api

Sleeping

App Files Files Community

blaxx14 committed on Apr 12

Commit

ec46473

1 Parent(s): 6027c7f

add gitignore and fix excel upload bug

Browse files

Files changed (3) hide show

.gitignore +4 -0
backend/file_uploads.py +0 -5
backend/parser.py +26 -18

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+parser/
+backend/__pycache__/
+temp/

backend/file_uploads.py CHANGED Viewed

@@ -41,11 +41,6 @@ def upload_file():
             parsed_file = {
                 'filename' : filename
             }
-        # res = convert_to_target_json(parsed_file)
-        # with open("output.json", "w") as f:
-        #     json.dump(res, f, indent=2)
         return jsonify({
             'message': 'File uploaded successfully',

             parsed_file = {
                 'filename' : filename
             }
         return jsonify({
             'message': 'File uploaded successfully',

backend/parser.py CHANGED Viewed

@@ -5,6 +5,7 @@ import json
 import pandas as pd
 from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
 from collections import Counter
 from difflib import SequenceMatcher
 def is_similar_header(h1, h2, threshold=0.8):
@@ -210,44 +211,51 @@ def parse_promotion_pdf(pdf_path):
     return result
 def parse_promotion_excel(excel_path, filename):
-    # Baca file Excel dari baris ke-6 (index 5)
-    df = pd.read_excel(excel_path, engine='openpyxl', header=5)
-    # Bersihkan data
     df.dropna(axis=1, how='all', inplace=True)
     df.dropna(axis=0, how='all', inplace=True)
-    # Pastikan semua kolom string dan beri nama untuk kolom tak bernama
     df.columns = [str(col) if not str(col).startswith('Unnamed') else f'Col_{i}' for i, col in enumerate(df.columns)]
-    # Ganti NaN dengan None
     df = df.where(pd.notnull(df), None)
-    # Konversi ke list of dict
     data = df.to_dict(orient="records")
-    # Buat folder temp jika belum ada
     os.makedirs('temp', exist_ok=True)
-    # Tambah .json jika belum ada
     if not filename.lower().endswith('.json'):
         filename += '.json'
-    # Cegah overwrite file
     filepath = os.path.join('temp', filename)
     base_name, ext = os.path.splitext(filename)
     copy_num = 1
     while os.path.exists(filepath):
         filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
         copy_num += 1
-    # Simpan file JSON
     with open(filepath, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)
     delete_temp_folder()
-    # Return data
     return data
 def convert_to_target_json(parsed_data):

 import pandas as pd
 from .file_utils import convert_pdf_to_word, delete_temp_folder, extract_tables_from_docx
 from collections import Counter
+from openpyxl import load_workbook
 from difflib import SequenceMatcher
 def is_similar_header(h1, h2, threshold=0.8):
     return result
 def parse_promotion_excel(excel_path, filename):
     """Parse the active sheet of an Excel workbook into a list of row dicts.

     The header row is the first of the top 20 rows that contains a non-empty
     string cell, and the table starts at the first such cell in that row.
     Everything above and to the left of that cell is ignored. If no string
     cell is found, the sheet is read as-is with row 1 as the header.

     The uploaded workbook is only read, never rewritten: rewriting it through
     openpyxl discards cached formula results and can change which sheet
     pandas sees first.

     Args:
         excel_path: Path to the .xlsx file to parse.
         filename: Base name for the JSON dump written under ``temp/``;
             ``.json`` is appended when missing.

     Returns:
         A list with one dict per non-empty data row, keyed by column name.
         Missing cells are ``None``.
     """
     # data_only=True yields cached formula results, which is also what
     # pandas reads, so the detected offsets match the parsed data.
     wb = load_workbook(excel_path, data_only=True)
     try:
         ws = wb.active
         sheet_name = ws.title
         # Fall back to the top-left corner when no text cell is found.
         start_row, start_col = 1, 1
         rows = ws.iter_rows(min_row=1, max_row=20, values_only=True)
         for i, row in enumerate(rows, start=1):
             text_cols = [
                 j for j, value in enumerate(row, start=1)
                 if isinstance(value, str) and value != ''
             ]
             if text_cols:
                 start_row, start_col = i, text_cols[0]
                 break
     finally:
         wb.close()
     # Read the same sheet that was inspected, using the detected row as the
     # header (pandas' header index is 0-based), then drop leading columns.
     df = pd.read_excel(
         excel_path,
         engine='openpyxl',
         sheet_name=sheet_name,
         header=start_row - 1,
     )
     df = df.iloc[:, start_col - 1:]
     df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
     df.columns = [
         str(col) if not str(col).startswith('Unnamed') else f'Col_{i}'
         for i, col in enumerate(df.columns)
     ]
     # Cast to object first: on numeric columns where() would otherwise keep
     # NaN, which json.dump emits as the invalid JSON token NaN.
     df = df.astype(object).where(df.notna(), None)
     data = df.to_dict(orient="records")
     os.makedirs('temp', exist_ok=True)
     if not filename.lower().endswith('.json'):
         filename += '.json'
     # Never overwrite an existing dump: append " (n)" until the name is free.
     filepath = os.path.join('temp', filename)
     base_name, ext = os.path.splitext(filename)
     copy_num = 1
     while os.path.exists(filepath):
         filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
         copy_num += 1
     with open(filepath, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)
     # NOTE(review): delete_temp_folder() is defined in file_utils and is not
     # visible here; confirm it does not remove the 'temp' dump just written.
     delete_temp_folder()
     return data
 def convert_to_target_json(parsed_data):