Guhanselvam committed
Commit ca2c678 · verified · 1 Parent(s): 1c1eb47

Update app.py

Files changed (1)
  1. app.py +58 -91
app.py CHANGED
@@ -1,107 +1,74 @@
 import subprocess
 subprocess.check_call(['pip', 'install', 'camelot-py[cv]']) # Install Camelot with OpenCV support
-import streamlit as st
 import pdfplumber
 import pandas as pd
 import json
-import os

-def extract_tables(file_path):
-    all_tables = []
-    with pdfplumber.open(file_path) as pdf:
-        for page_number, page in enumerate(pdf.pages, start=1):
-            extracted_tables = page.extract_tables()
-            for table_index, table in enumerate(extracted_tables):
-                df = pd.DataFrame(table[1:], columns=table[0]) # Use the first row as headers
-                table_info = {
-                    'page_number': page_number,
-                    'table_index': table_index,
-                    'dataframe': df
-                }
-                all_tables.append(table_info)
-    return all_tables
-
-def aggregate_and_save_tables(tables_list):
-    json_metadata = {'tables': [], 'total_tables': len(tables_list)}
-    current_combined_table = None
-    combined_csv_files = []
-
-    for table_info in tables_list:
-        page_number = table_info['page_number']
-        df = table_info['dataframe']
-
-        # Check if headers match for logical table continuation
-        if current_combined_table is None:
-            current_combined_table = df
-        else:
-            # Ensure headers are aligned in structure
-            headers_match = len(current_combined_table.columns) == len(df.columns) and \
-                            (current_combined_table.columns == df.columns).all()
-            if headers_match:
-                current_combined_table = pd.concat([current_combined_table, df], ignore_index=True)
-            else:
-                # Save the current table as CSV
-                file_name = f"combined_table_page_{page_number - 1}.csv"
-                current_combined_table.to_csv(file_name, index=False)
-                combined_csv_files.append(file_name)
-                json_metadata['tables'].append({
-                    'csv_file': file_name,
-                    'description': f"Combined table up to page {page_number - 1}",
-                    'rows': current_combined_table.shape[0]
-                })
-                # Start a new table
-                current_combined_table = df
-
-    # Save the last combined table if it exists
-    if current_combined_table is not None:
-        file_name = f"combined_table_final.csv"
-        current_combined_table.to_csv(file_name, index=False)
-        combined_csv_files.append(file_name)
-        json_metadata['tables'].append({
-            'csv_file': file_name,
-            'description': "Final combined table",
-            'rows': current_combined_table.shape[0]
-        })
-
-    # Save overall metadata to JSON
-    json_filename = "tables_metadata.json"
-    with open(json_filename, 'w') as json_file:
-        json.dump(json_metadata, json_file, indent=4)
-
-    return combined_csv_files, json_filename
-
-# Streamlit Web App
-st.title("PDF Table Extractor for Combined Tables")
-
-uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-if uploaded_file is not None:
-    temp_file_path = "temp_uploaded_file.pdf"
-
-    with open(temp_file_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-
-    try:
-        extracted_tables = extract_tables(temp_file_path)
-        os.remove(temp_file_path)
-
-        combined_csvs, metadata_file = aggregate_and_save_tables(extracted_tables)
-
-        st.success("Table extraction and aggregation complete.")
-
-        for csv_file in combined_csvs:
-            st.download_button(
-                label=f"Download {csv_file}",
-                data=open(csv_file, 'r').read(),
-                file_name=csv_file,
-                mime='text/csv'
-            )
-
-        st.download_button(
-            label="Download JSON Metadata",
-            data=open(metadata_file, 'r').read(),
-            file_name=metadata_file,
-            mime='application/json'
-        )
-
-    except Exception as e:
-        st.error(f"An error occurred during processing: {e}")
+def extract_tables_from_pdf(pdf_path):
+    tables_data = {
+        'table1': [],
+        'table2': None
+    }
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Extract pages for Table 1
+        for page_number in range(3): # Table 1 spans from page 1 to 3
+            page = pdf.pages[page_number]
+            extracted_tables = page.extract_tables()
+            for table in extracted_tables:
+                df = pd.DataFrame(table[1:], columns=table[0]) # Assume the first row as headers
+                tables_data['table1'].append(df)
+
+        # Extract last page for Table 2
+        last_page_number = len(pdf.pages) - 1
+        last_page = pdf.pages[last_page_number]
+        extracted_tables = last_page.extract_tables()
+        if extracted_tables:
+            tables_data['table2'] = pd.DataFrame(extracted_tables[0][1:], columns=extracted_tables[0][0])
+
+    return tables_data
+
+def save_data(tables_data):
+    # Combine and save Table 1 data
+    if tables_data['table1']:
+        table1_combined = pd.concat(tables_data['table1'], ignore_index=True)
+        table1_filename = 'table1.csv'
+        table1_combined.to_csv(table1_filename, index=False)
+
+    # Save Table 2 data
+    table2_filename = 'table2.csv'
+    if tables_data['table2'] is not None:
+        tables_data['table2'].to_csv(table2_filename, index=False)
+
+    # Create JSON context
+    context = {
+        'table1': {
+            'description': 'Table 1 extracted from pages 1 to 3',
+            'source': table1_filename,
+            'rows': table1_combined.shape[0]
+        },
+        'table2': {
+            'description': 'Table 2 extracted from the last page',
+            'source': table2_filename,
+            'rows': tables_data['table2'].shape[0] if tables_data['table2'] is not None else 0
+        }
+    }
+
+    context_filename = 'tables_context.json'
+    with open(context_filename, 'w') as json_file:
+        json.dump(context, json_file, indent=4)
+
+    return table1_filename, table2_filename, context_filename
+
+# Main function
+def main(pdf_path):
+    tables_data = extract_tables_from_pdf(pdf_path)
+    table1_filename, table2_filename, context_filename = save_data(tables_data)
+
+    print(f"CSV file for Table 1 saved as: {table1_filename}")
+    print(f"CSV file for Table 2 saved as: {table2_filename}")
+    print(f"JSON context saved as: {context_filename}")
+
+# Example usage
+pdf_path = "your_pdf_file.pdf" # Replace with your actual PDF file path
+main(pdf_path)
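
For reference, a minimal sketch of how the files written by the new app.py could be read back for inspection. The filenames table1.csv, table2.csv, and tables_context.json come from the diff above; the inspect_outputs helper and its printing logic are illustrative assumptions, not part of this commit.

import json
from pathlib import Path

import pandas as pd

def inspect_outputs(output_dir="."):
    # Load the JSON context that save_data() writes; it records each table's CSV path and row count.
    out = Path(output_dir)
    with open(out / "tables_context.json") as f:
        context = json.load(f)

    for name, meta in context.items():
        csv_path = out / meta["source"]
        if not csv_path.exists():
            # table2.csv is only written when a table was actually found on the last page,
            # even though the JSON context always references it.
            print(f"{name}: no CSV written ({meta['description']})")
            continue
        df = pd.read_csv(csv_path)
        print(f"{name}: {meta['description']} -> {df.shape[0]} rows x {df.shape[1]} columns")

inspect_outputs()

The existence check is the one design point worth noting: save_data() references both CSVs in tables_context.json, but only writes table2.csv when the last page actually contains a table.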