neerajkalyank committed on
Commit
ab51c22
1 Parent(s): 4207427

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -30
app.py CHANGED
@@ -1,45 +1,111 @@
 
1
  import pandas as pd
2
  import fitz # PyMuPDF for PDF reading
3
  import pytesseract # Tesseract for OCR
 
4
  import re
 
 
 
 
5
 
6
def ai_parse_pdf(pdf_file, language="eng"):
    """Extract purchase-order rows from a PDF into a DataFrame.

    Args:
        pdf_file: A file-like object exposing ``read()`` (its bytes are
            opened as a PDF stream), or any other value, which is handed
            to ``fitz.open`` unchanged.
        language: OCR language code. Not used by this implementation; kept
            so existing callers keep working.

    Returns:
        ``(df, metadata)`` where ``df`` has exactly the expected
        purchase-order columns and ``metadata`` is the document's metadata,
        or ``(None, None)`` if anything fails (the error is printed).
    """
    expected_columns = [
        "Purchase Order", "Pos.", "Item Code", "Unit", "Delivery Date",
        "Quantity", "Basic Price", "Amount", "SUB TOTAL",
    ]
    # The dot in "Pos." is escaped so it only matches a literal period.
    header_pattern = re.compile(
        r"(Purchase Order|Pos\.|Item Code|Unit|Delivery Date|Quantity|Basic Price|Amount|SUB TOTAL)"
    )
    # NOTE(review): this is still the placeholder pattern from the original
    # code; it must be replaced with a real row pattern before rows can match.
    row_pattern = re.compile(r"(your row matching pattern here)")

    try:
        if hasattr(pdf_file, 'read'):
            pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        else:
            pdf_document = fitz.open(pdf_file)

        try:
            detected_columns = []
            raw_rows = []
            for page_num in range(pdf_document.page_count):
                text = pdf_document.load_page(page_num).get_text("text")

                # Collect each header once, in first-seen order, across all
                # pages: duplicate labels would make reindex() fail below.
                for header in header_pattern.findall(text):
                    if header not in detected_columns:
                        detected_columns.append(header)

                raw_rows.extend(match.split() for match in row_pattern.findall(text))

            metadata = pdf_document.metadata
        finally:
            # Release the document even when a page fails to parse.
            pdf_document.close()

        # Fall back to the expected layout when no header was recognised.
        if not detected_columns:
            detected_columns = expected_columns

        # Pad or trim every row to the column count so DataFrame
        # construction cannot fail on ragged rows.
        width = len(detected_columns)
        all_data = [row[:width] + [""] * (width - len(row)) for row in raw_rows]

        df = pd.DataFrame(all_data, columns=detected_columns)
        # Reorder to the expected format, filling missing columns.
        df = df.reindex(columns=expected_columns, fill_value="")
        return df, metadata
    except Exception as e:
        print("Error in ai_parse_pdf:", e)
        return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
  import pandas as pd
3
  import fitz # PyMuPDF for PDF reading
4
  import pytesseract # Tesseract for OCR
5
+ from io import BytesIO
6
  import re
7
+ from tqdm import tqdm # For progress indicators
8
+
9
# Point pytesseract at the Tesseract binary. Prefer whatever is found on PATH
# and only fall back to the previous hard-coded location when lookup fails, so
# a working installation elsewhere is not overridden by a wrong path.
import shutil

pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or r'/usr/local/bin/tesseract'
11
 
12
def ai_parse_pdf(pdf_file, language="eng"):
    """Open a PDF and return placeholder table data plus its metadata.

    Args:
        pdf_file: The PDF to read. Accepted forms are a file-like object
            exposing ``read()``, raw ``bytes``/``bytearray``, or a path given
            as ``str`` or ``os.PathLike``.
        language: OCR language code. Not used by this implementation; kept
            so existing callers keep working.

    Returns:
        ``(df, metadata)`` where ``df`` is a one-row mock DataFrame (real
        extraction is not implemented here) and ``metadata`` is a dict copy
        of the document metadata (empty when the document has none).
        Returns ``(None, None)`` if the input type is unsupported or opening
        the document fails; the error is printed.
    """
    import os  # local import so this block does not depend on file-level imports

    try:
        if hasattr(pdf_file, 'read'):  # file-like object (e.g., BytesIO)
            pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        elif isinstance(pdf_file, (bytes, bytearray)):  # raw PDF content
            pdf_document = fitz.open(stream=bytes(pdf_file), filetype="pdf")
        elif isinstance(pdf_file, (str, os.PathLike)):  # filesystem path
            pdf_document = fitz.open(os.fspath(pdf_file))
        else:
            raise ValueError("Unsupported file type. Please upload a valid PDF file.")

        try:
            # Copy the metadata so it stays usable after the document closes.
            metadata = dict(pdf_document.metadata or {})
        finally:
            # Always release the document; the original never closed it.
            pdf_document.close()
        print("Retrieved Metadata:", metadata)  # Debugging step

        # Mock data to simplify debugging of the surrounding pipeline.
        data = [{"Column1": "Test1", "Column2": "Test2"}]
        df = pd.DataFrame(data)
        return df, metadata

    except Exception as e:
        print("Error in ai_parse_pdf:", e)  # Debugging output
        return None, None
 
34
 
35
def export_to_excel(df, metadata):
    """Serialise a DataFrame (and optional metadata) to an in-memory workbook.

    Args:
        df: The data to write to the "Extracted Data" sheet. Must be a
            non-empty ``pandas.DataFrame``.
        metadata: Optional dict written to a "Metadata" sheet with one row
            per key and a single "Value" column. Anything falsy or not a
            dict is skipped with a printed notice.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the .xlsx data, or
        ``None`` if ``df`` is invalid or writing fails (the error is printed).
    """
    # Validate before creating the writer: raising inside the ExcelWriter
    # context would make it try to finalise a workbook that has no sheets.
    if not isinstance(df, pd.DataFrame) or df.empty:
        print("Error in export_to_excel:", "DataFrame is empty or invalid.")
        return None

    try:
        output = BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Extracted Data")

            # Write metadata only if it is a non-empty dictionary.
            if metadata and isinstance(metadata, dict):
                metadata_df = pd.DataFrame.from_dict(metadata, orient="index", columns=["Value"])
                metadata_df.to_excel(writer, sheet_name="Metadata")
            else:
                print("Metadata is empty or invalid, skipping metadata export.")

        # Rewind so callers can read the workbook from the start.
        output.seek(0)
        return output
    except Exception as e:
        print("Error in export_to_excel:", e)  # Debugging output
        return None
57
 
58
+ # Gradio function to extract, display, and cleanse data
59
def extract_and_display_data(pdf_file, language):
    """Parse the uploaded PDF and return values for the preview widgets.

    Args:
        pdf_file: The uploaded PDF, passed through to ``ai_parse_pdf``.
        language: OCR language code, passed through to ``ai_parse_pdf``.

    Returns:
        ``(df, metadata)`` from ``ai_parse_pdf`` on success. If either value
        is ``None`` or an exception is raised, returns an empty DataFrame and
        a dict with a single ``"error"`` key describing the failure.
    """
    try:
        frame, info = ai_parse_pdf(pdf_file, language)
        if frame is not None and info is not None:
            return frame, info
        return pd.DataFrame(), {"error": "Failed to parse PDF or retrieve metadata."}
    except Exception as exc:
        print("Error in extract_and_display_data:", exc)
        return pd.DataFrame(), {"error": str(exc)}
68
+
69
+ # Function to confirm and export data to Excel
70
def confirm_and_export_data(df, metadata):
    """Export the previewed data to an .xlsx file for download.

    The result feeds a ``gr.File`` output, which takes a file path rather
    than an in-memory buffer or an error dict, so the workbook produced by
    ``export_to_excel`` is written to a temporary file and its path returned.

    Args:
        df: The DataFrame to export.
        metadata: Metadata dict forwarded to ``export_to_excel``.

    Returns:
        The path (``str``) of the temporary .xlsx file, or ``None`` if the
        workbook could not be created or written (the error is printed).
    """
    import tempfile  # local import so this block does not depend on file-level imports

    try:
        excel_file = export_to_excel(df, metadata)
        if excel_file is None:
            print("Error in confirm_and_export_data:", "Failed to create Excel file")
            return None

        # delete=False keeps the file on disk after the handle closes so the
        # download component can still serve it.
        with tempfile.NamedTemporaryFile(
            prefix="extracted_data_", suffix=".xlsx", delete=False
        ) as handle:
            handle.write(excel_file.getvalue())
        return handle.name
    except Exception as e:
        print("Error in confirm_and_export_data:", e)  # Debugging output
        return None
79
+
80
+ # Gradio Interface setup
81
# Build the Blocks UI: upload + language inputs, preview widgets, and two
# buttons wired to the extract and export callbacks.
with gr.Blocks() as iface:
    gr.Markdown("### Simplified PDF to Excel Data Extractor for Debugging")

    # Inputs: the PDF itself and the OCR language selection.
    pdf_file = gr.File(label="Upload PDF")
    language = gr.Dropdown(
        choices=["eng", "spa", "fra"],
        value="eng",
        label="OCR Language (if applicable)",
    )

    # Preview of the extracted table and the PDF metadata.
    data_display = gr.DataFrame(label="Extracted Data Preview", interactive=True)
    metadata_display = gr.JSON(label="PDF Metadata")

    # Action buttons share one row.
    with gr.Row():
        extract_btn = gr.Button("Extract and Display Data")
        export_btn = gr.Button("Confirm and Export to Excel")

    # Extract: fill both preview widgets from the uploaded PDF.
    extract_btn.click(
        fn=extract_and_display_data,
        inputs=[pdf_file, language],
        outputs=[data_display, metadata_display],
    )

    # Export: turn the (possibly edited) preview into a downloadable file.
    excel_download = gr.File(label="Download Excel")
    export_btn.click(
        fn=confirm_and_export_data,
        inputs=[data_display, metadata_display],
        outputs=excel_download,
    )

iface.launch(share=True)  # Set share=True to create a public link