Guhanselvam committed on
Commit
1c1eb47
·
verified ·
1 Parent(s): 512d90a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -17
app.py CHANGED
@@ -11,7 +11,6 @@ def extract_tables(file_path):
11
  with pdfplumber.open(file_path) as pdf:
12
  for page_number, page in enumerate(pdf.pages, start=1):
13
  extracted_tables = page.extract_tables()
14
-
15
  for table_index, table in enumerate(extracted_tables):
16
  df = pd.DataFrame(table[1:], columns=table[0]) # Use the first row as headers
17
  table_info = {
@@ -23,10 +22,7 @@ def extract_tables(file_path):
23
  return all_tables
24
 
25
  def aggregate_and_save_tables(tables_list):
26
- json_metadata = {
27
- 'tables': [],
28
- 'total_tables': len(tables_list)
29
- }
30
  current_combined_table = None
31
  combined_csv_files = []
32
 
@@ -34,15 +30,17 @@ def aggregate_and_save_tables(tables_list):
34
  page_number = table_info['page_number']
35
  df = table_info['dataframe']
36
 
37
- # Logic to determine whether to continue the previous table or start a new one
38
  if current_combined_table is None:
39
  current_combined_table = df
40
  else:
41
- # Simple heuristic: check if the current table's header matches header of the last table
42
- if (current_combined_table.columns == df.columns).all():
 
 
43
  current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
44
  else:
45
- # Save current_combined_table to a CSV
46
  file_name = f"combined_table_page_{page_number - 1}.csv"
47
  current_combined_table.to_csv(file_name, index=False)
48
  combined_csv_files.append(file_name)
@@ -51,10 +49,10 @@ def aggregate_and_save_tables(tables_list):
51
  'description': f"Combined table up to page {page_number - 1}",
52
  'rows': current_combined_table.shape[0]
53
  })
54
- # Start a new combined table
55
  current_combined_table = df
56
 
57
- # Save the last combined table if exists
58
  if current_combined_table is not None:
59
  file_name = f"combined_table_final.csv"
60
  current_combined_table.to_csv(file_name, index=False)
@@ -77,22 +75,19 @@ st.title("PDF Table Extractor for Combined Tables")
77
 
78
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
79
  if uploaded_file is not None:
80
- # Save the uploaded file temporarily
81
  temp_file_path = "temp_uploaded_file.pdf"
82
 
83
  with open(temp_file_path, "wb") as f:
84
  f.write(uploaded_file.getbuffer())
85
 
86
  try:
87
- # Extract tables and process into combined CSVs and a JSON metadata file
88
  extracted_tables = extract_tables(temp_file_path)
89
- os.remove(temp_file_path) # Clean up the temporary file
90
-
91
  combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
92
 
93
  st.success("Table extraction and aggregation complete.")
94
 
95
- # Provide download links for each generated CSV file
96
  for csv_file in combined_csvs:
97
  st.download_button(
98
  label=f"Download {csv_file}",
@@ -101,7 +96,6 @@ if uploaded_file is not None:
101
  mime='text/csv'
102
  )
103
 
104
- # Provide download link for the JSON metadata file
105
  st.download_button(
106
  label="Download JSON Metadata",
107
  data=open(metadata_file, 'r').read(),
 
11
  with pdfplumber.open(file_path) as pdf:
12
  for page_number, page in enumerate(pdf.pages, start=1):
13
  extracted_tables = page.extract_tables()
 
14
  for table_index, table in enumerate(extracted_tables):
15
  df = pd.DataFrame(table[1:], columns=table[0]) # Use the first row as headers
16
  table_info = {
 
22
  return all_tables
23
 
24
  def aggregate_and_save_tables(tables_list):
25
+ json_metadata = {'tables': [], 'total_tables': len(tables_list)}
 
 
 
26
  current_combined_table = None
27
  combined_csv_files = []
28
 
 
30
  page_number = table_info['page_number']
31
  df = table_info['dataframe']
32
 
33
+ # Check if headers match for logical table continuation
34
  if current_combined_table is None:
35
  current_combined_table = df
36
  else:
37
+ # Ensure headers are aligned in structure
38
+ headers_match = len(current_combined_table.columns) == len(df.columns) and \
39
+ (current_combined_table.columns == df.columns).all()
40
+ if headers_match:
41
  current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
42
  else:
43
+ # Save the current table as CSV
44
  file_name = f"combined_table_page_{page_number - 1}.csv"
45
  current_combined_table.to_csv(file_name, index=False)
46
  combined_csv_files.append(file_name)
 
49
  'description': f"Combined table up to page {page_number - 1}",
50
  'rows': current_combined_table.shape[0]
51
  })
52
+ # Start a new table
53
  current_combined_table = df
54
 
55
+ # Save the last combined table if it exists
56
  if current_combined_table is not None:
57
  file_name = f"combined_table_final.csv"
58
  current_combined_table.to_csv(file_name, index=False)
 
75
 
76
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
77
  if uploaded_file is not None:
 
78
  temp_file_path = "temp_uploaded_file.pdf"
79
 
80
  with open(temp_file_path, "wb") as f:
81
  f.write(uploaded_file.getbuffer())
82
 
83
  try:
 
84
  extracted_tables = extract_tables(temp_file_path)
85
+ os.remove(temp_file_path)
86
+
87
  combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
88
 
89
  st.success("Table extraction and aggregation complete.")
90
 
 
91
  for csv_file in combined_csvs:
92
  st.download_button(
93
  label=f"Download {csv_file}",
 
96
  mime='text/csv'
97
  )
98
 
 
99
  st.download_button(
100
  label="Download JSON Metadata",
101
  data=open(metadata_file, 'r').read(),