Spaces:

blaxx14
/

pdf-parser-api

Sleeping

blaxx14 committed on Apr 12

Commit

fa6a5e5

2 Parent(s): ec46473 15a0b55

merging

Files changed (3) hide show

backend/file_uploads.py CHANGED Viewed

@@ -6,7 +6,7 @@ import os
 app = Flask(__name__)
-app.config['UPLOAD_FOLDER'] = 'temp'
 app.config['ALLOWED_EXTENSIONS'] = {'pdf', 'png', 'jpg', 'jpeg', 'xlsx', 'csv'}
 app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
@@ -25,10 +25,10 @@ def upload_file():
     if file and allowed_file(file.filename):
         filename = secure_filename(file.filename)
-        if not os.path.exists(app.config['UPLOAD_FOLDER']):
-            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
-        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
         file.save(filepath)
         if filename.endswith('.pdf'):

 app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = 'tmp'
 app.config['ALLOWED_EXTENSIONS'] = {'pdf', 'png', 'jpg', 'jpeg', 'xlsx', 'csv'}
 app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
     if file and allowed_file(file.filename):
         filename = secure_filename(file.filename)
+        if not os.path.exists('/tmp'):
+            os.makedirs('/tmp', exist_ok=True)
+        filepath = os.path.join('/tmp', filename)
         file.save(filepath)
         if filename.endswith('.pdf'):

backend/parser.py CHANGED Viewed

@@ -117,9 +117,9 @@ def parse_promotion_pdf(pdf_path):
     docx_path = pathname + ".docx"
     with open(pdf_path, 'rb') as f:
-        convert_pdf_to_word(f, os.path.join('temp', docx_path))
-    tables = extract_tables_from_docx(os.path.join('temp', docx_path))
     tables_result = parse_table_data(tables)
     del tables_result[0]
@@ -243,14 +243,20 @@ def parse_promotion_excel(excel_path, filename):
     df = df.where(pd.notnull(df), None)
     data = df.to_dict(orient="records")
-    os.makedirs('temp', exist_ok=True)
     if not filename.lower().endswith('.json'):
         filename += '.json'
-    filepath = os.path.join('temp', filename)
     base_name, ext = os.path.splitext(filename)
     copy_num = 1
     while os.path.exists(filepath):
-        filepath = os.path.join('temp', f"{base_name} ({copy_num}){ext}")
         copy_num += 1
     with open(filepath, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)

     docx_path = pathname + ".docx"
     with open(pdf_path, 'rb') as f:
+        convert_pdf_to_word(f, os.path.join('/tmp', docx_path))
+    tables = extract_tables_from_docx(os.path.join('/tmp', docx_path))
     tables_result = parse_table_data(tables)
     del tables_result[0]
     df = df.where(pd.notnull(df), None)
     data = df.to_dict(orient="records")
+    # Buat folder temp jika belum ada
+    os.makedirs('/tmp', exist_ok=True)
+    # Tambah .json jika belum ada
     if not filename.lower().endswith('.json'):
         filename += '.json'
+    # Cegah overwrite file
+    filepath = os.path.join('/tmp', filename)
     base_name, ext = os.path.splitext(filename)
     copy_num = 1
     while os.path.exists(filepath):
+        filepath = os.path.join('/tmp', f"{base_name} ({copy_num}){ext}")
         copy_num += 1
     with open(filepath, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)

temp/file.txt ADDED Viewed

File without changes