Guhanselvam committed on
Commit
712a7bd
·
verified ·
1 Parent(s): 2b00e5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -7
app.py CHANGED
@@ -20,16 +20,28 @@ def extract_tables_with_pdfplumber(file):
20
  return tables
21
 
22
  def process_tables(tables):
23
- # Extract the first three tables assumed to be part of Table 1
24
- table1_data = [pd.DataFrame(t['data'][1:], columns=t['data'][0]) for t in tables[:3]]
 
 
 
 
 
 
 
 
 
 
 
 
25
  table1_df = pd.concat(table1_data, ignore_index=True)
26
  table1_filename = "table1.csv"
27
  table1_df.to_csv(table1_filename, index=False)
28
 
29
- # Extract the last table assumed to be Table 2
30
- table2_data = pd.DataFrame(tables[-1]['data'][1:], columns=tables[-1]['data'][0])
31
  table2_filename = "table2.csv"
32
- table2_data.to_csv(table2_filename, index=False)
33
 
34
  # Prepare context JSON with detailed context
35
  context = {
@@ -45,8 +57,8 @@ def process_tables(tables):
45
  'description': 'Table 2 extracted from the last page',
46
  'pages': tables[-1]['page_number'],
47
  'csv_path': table2_filename,
48
- 'rows': table2_data.shape[0],
49
- 'columns': table2_data.shape[1]
50
  }
51
  }
52
 
 
20
  return tables
21
 
22
  def process_tables(tables):
23
def create_dataframe(table_data):
    """Build a DataFrame from an extracted table, guaranteeing unique column names.

    The first row of ``table_data`` is used as the header and the remaining
    rows as the data. Repeated header values are disambiguated by appending
    ``_1``, ``_2``, ... to the second and later occurrences, skipping any
    suffix that would collide with a name already in use.

    Args:
        table_data: Sequence of rows (each a sequence of cell values); the
            first row holds the column headers.

    Returns:
        A ``pd.DataFrame`` with one column per header cell. An empty
        ``table_data`` yields an empty DataFrame.
    """
    if not table_data:
        # Nothing was extracted: return an empty frame rather than failing
        # on the header lookup.
        return pd.DataFrame()

    headers, *rows = table_data

    seen = set()        # every column name emitted so far
    next_suffix = {}    # per-header counter for the next suffix to try
    unique_headers = []
    for header in headers:
        candidate = header
        if candidate in seen:
            suffix = next_suffix.get(header, 1)
            candidate = f"{header}_{suffix}"
            # A generated name may clash with a real header (e.g. an
            # existing "a_1" column), so keep advancing until it is free.
            while candidate in seen:
                suffix += 1
                candidate = f"{header}_{suffix}"
            next_suffix[header] = suffix + 1
        seen.add(candidate)
        unique_headers.append(candidate)

    return pd.DataFrame(rows, columns=unique_headers)
34
+
35
+ # Extract and process the tables into DataFrames
36
+ table1_data = [create_dataframe(t['data']) for t in tables[:3]]
37
  table1_df = pd.concat(table1_data, ignore_index=True)
38
  table1_filename = "table1.csv"
39
  table1_df.to_csv(table1_filename, index=False)
40
 
41
+ # Extract and process the last table assumed to be Table 2
42
+ table2_df = create_dataframe(tables[-1]['data'])
43
  table2_filename = "table2.csv"
44
+ table2_df.to_csv(table2_filename, index=False)
45
 
46
  # Prepare context JSON with detailed context
47
  context = {
 
57
  'description': 'Table 2 extracted from the last page',
58
  'pages': tables[-1]['page_number'],
59
  'csv_path': table2_filename,
60
+ 'rows': table2_df.shape[0],
61
+ 'columns': table2_df.shape[1]
62
  }
63
  }
64