Guhanselvam committed on
Commit
2b00e5d
·
verified ·
1 Parent(s): 70b87d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -57
app.py CHANGED
@@ -4,86 +4,92 @@ import streamlit as st
4
  import pdfplumber
5
  import pandas as pd
6
  import json
7
- import os
8
 
9
def extract_tables_from_pdf(file_stream):
    """Extract tables from a PDF file stream.

    Returns a dict with:
      'table1': list of DataFrames, one per table found on the first pages
                (up to 3), each using its first row as the header.
      'table2': DataFrame built from the first table on the last page,
                or None when the last page has no tables.
    """
    tables_data = {
        'table1': [],
        'table2': None
    }

    with pdfplumber.open(file_stream) as pdf:
        # Table 1 spans the first pages. The original hard-coded range(3),
        # which raised IndexError for PDFs shorter than 3 pages; clamp to
        # the actual page count instead.
        for page_number in range(min(3, len(pdf.pages))):
            page = pdf.pages[page_number]
            extracted_tables = page.extract_tables()
            for table in extracted_tables:
                # Assume the first row is the header row.
                df = pd.DataFrame(table[1:], columns=table[0])
                tables_data['table1'].append(df)

        # Table 2 is the first table on the last page (if any).
        last_page = pdf.pages[-1]
        extracted_tables = last_page.extract_tables()
        if extracted_tables:
            tables_data['table2'] = pd.DataFrame(
                extracted_tables[0][1:], columns=extracted_tables[0][0]
            )

    return tables_data
32
 
33
def save_data(tables_data):
    """Persist extracted tables as CSVs plus a JSON context summary.

    tables_data: dict with 'table1' (list of DataFrames, possibly empty)
    and 'table2' (a DataFrame or None).
    Returns (table1_csv_path, table2_csv_path, context_json_path); the CSV
    files are only written when the corresponding data is present, but
    their paths are always returned.
    """
    cwd = os.getcwd()
    table1_filename = os.path.join(cwd, 'table1.csv')
    table2_filename = os.path.join(cwd, 'table2.csv')
    context_filename = os.path.join(cwd, 'tables_context.json')

    # Table 1: merge the per-page DataFrames before writing.
    table1_rows = 0
    if tables_data['table1']:
        combined = pd.concat(tables_data['table1'], ignore_index=True)
        combined.to_csv(table1_filename, index=False)
        table1_rows = combined.shape[0]

    # Table 2: a single DataFrame, written as-is when present.
    table2 = tables_data['table2']
    table2_rows = 0
    if table2 is not None:
        table2.to_csv(table2_filename, index=False)
        table2_rows = table2.shape[0]

    # Describe what was saved, and where, for downstream consumers.
    context = {
        'table1': {
            'description': 'Table 1 extracted from pages 1 to 3',
            'source': table1_filename,
            'rows': table1_rows,
        },
        'table2': {
            'description': 'Table 2 extracted from the last page',
            'source': table2_filename,
            'rows': table2_rows,
        },
    }

    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename
66
 
67
# Streamlit Web App
st.title("PDF Table Extractor")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file is not None:
    try:
        tables_data = extract_tables_from_pdf(uploaded_file)
        table1_filename, table2_filename, context_filename = save_data(tables_data)

        st.success("Extraction complete.")

        # Offer each generated artifact for download.
        downloads = [
            (table1_filename, "Download Table 1 CSV", 'table1.csv', 'text/csv'),
            (table2_filename, "Download Table 2 CSV", 'table2.csv', 'text/csv'),
            (context_filename, "Download JSON Context", 'tables_context.json', 'application/json'),
        ]
        for path, label, out_name, mime_type in downloads:
            with open(path, 'rb') as f:
                st.download_button(label, data=f, file_name=out_name, mime=mime_type)

    except Exception as e:
        st.error(f"An error occurred during processing: {e}")
 
4
  import pdfplumber
5
  import pandas as pd
6
  import json
 
7
 
8
def extract_tables_with_pdfplumber(file):
    """Collect every table in the PDF, tagged with its 1-based page number.

    Returns a list of dicts of the form
    {'page_number': int, 'data': raw table rows as returned by pdfplumber}.
    """
    collected = []
    with pdfplumber.open(file) as pdf:
        for index, page in enumerate(pdf.pages, start=1):
            # Keep only non-empty extraction results from this page.
            collected.extend(
                {'page_number': index, 'data': table}
                for table in page.extract_tables()
                if table
            )
    return collected
 
 
 
 
 
21
 
22
def process_tables(tables):
    """Write Table 1 and Table 2 to CSV files plus a JSON summary.

    Table 1 is the concatenation of the first up-to-three extracted
    tables; Table 2 is the last extracted table. Each raw table's first
    row is treated as its header.

    tables: non-empty list of {'page_number': int, 'data': rows} dicts.
    Returns (table1_csv, table2_csv, context_json) filenames.
    """
    # Table 1: first up-to-three tables, merged into one frame.
    head = tables[:3]
    frames = [pd.DataFrame(entry['data'][1:], columns=entry['data'][0]) for entry in head]
    table1_df = pd.concat(frames, ignore_index=True)
    table1_filename = "table1.csv"
    table1_df.to_csv(table1_filename, index=False)

    # Table 2: the last table in the document.
    tail = tables[-1]
    table2_df = pd.DataFrame(tail['data'][1:], columns=tail['data'][0])
    table2_filename = "table2.csv"
    table2_df.to_csv(table2_filename, index=False)

    # Summarize what was extracted and where it was written.
    context = {
        'tables_extracted': len(tables),
        'table1': {
            'description': 'Table 1 extracted from the first 3 pages',
            'pages': [entry['page_number'] for entry in head],
            'csv_path': table1_filename,
            'rows': table1_df.shape[0],
            'columns': table1_df.shape[1],
        },
        'table2': {
            'description': 'Table 2 extracted from the last page',
            'pages': tail['page_number'],
            'csv_path': table2_filename,
            'rows': table2_df.shape[0],
            'columns': table2_df.shape[1],
        },
    }

    context_filename = "tables_context.json"
    with open(context_filename, 'w') as json_file:
        json.dump(context, json_file, indent=4)

    return table1_filename, table2_filename, context_filename
58
 
59
# Streamlit Web App
st.title("PDF Table Extractor with pdfplumber")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    try:
        tables = extract_tables_with_pdfplumber(uploaded_file)
        if not tables:
            st.error("No tables were extracted from the PDF.")
        else:
            table1, table2, context = process_tables(tables)
            st.success("Extraction complete.")

            # Provide download links for the output files.
            # The previous open(path, 'r').read() calls leaked file
            # handles; read inside a `with` block instead.
            with open(table1, 'r') as f:
                st.download_button(
                    label="Download Table 1 CSV",
                    data=f.read(),
                    file_name=table1,
                    mime='text/csv'
                )

            with open(table2, 'r') as f:
                st.download_button(
                    label="Download Table 2 CSV",
                    data=f.read(),
                    file_name=table2,
                    mime='text/csv'
                )

            with open(context, 'r') as f:
                st.download_button(
                    label="Download Context JSON",
                    data=f.read(),
                    file_name=context,
                    mime='application/json'
                )

    except Exception as e:
        st.error(f"An error occurred: {e}")