File size: 1,148 Bytes
e9afe19
 
 
 
 
 
 
fdad087
e9afe19
 
 
 
 
06055f2
e9afe19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import streamlit as st
import subprocess
from subprocess import STDOUT, check_call
import os
import base64
import camelot as cam

@st.cache_data
def gh():
    """Install the Ghostscript system package with ``apt-get`` (best effort).

    The call blocks until ``apt-get`` exits, so the rest of the script does
    not start while the installation is still in progress.  All output of the
    command is discarded.  The exit status is deliberately ignored and a
    missing ``apt-get`` binary is tolerated: the installation is an optional
    convenience and must not crash the app.

    NOTE(review): Ghostscript is presumably installed for camelot's benefit;
    confirm whether the ``stream`` flavor used by this app needs it at all.

    Returns:
        None.  ``st.cache_data`` memoizes this result, so the command is not
        re-run on every Streamlit script rerun.
    """
    try:
        # An argument list without a shell avoids quoting pitfalls, and
        # subprocess.DEVNULL avoids leaking a manually opened file handle.
        subprocess.run(
            ["apt-get", "install", "-y", "ghostscript"],
            stdin=subprocess.DEVNULL,  # never wait on an interactive prompt
            stdout=subprocess.DEVNULL,
            stderr=STDOUT,
            check=False,
        )
    except OSError:
        # apt-get could not be started (e.g. it is not installed on this
        # system); keep the best-effort behavior and carry on.
        pass


gh()

# --- Page layout and user inputs -------------------------------------------
st.title("Extract Tables from PDFs")

input_pdf = st.file_uploader(label="Upload PDF here", type='pdf')

st.markdown("### Page Number")

# The widget yields a string; it is handed to camelot's ``pages`` argument.
page_number = st.text_input("Enter the page # from where you want the table", value="1")

if input_pdf is not None:

    # camelot is called with a file path, so write the upload to disk first.
    # getvalue() returns the whole buffer regardless of the current stream
    # position, and the bytes are written as-is.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())

    # Fall back to the default page when the field is left blank.
    pages = page_number.strip() or "1"

    table = cam.read_pdf("input.pdf", pages=pages, flavor='stream')

    st.markdown("## Number of Tables")

    st.write(table)

    if len(table) > 0:

        # Offer 1-based table numbers 1..n. Starting the range at 0 would add
        # an extra choice that wraps around to the last table via index -1.
        option = st.selectbox(
            label="Select the table to be displayed",
            options=range(1, len(table) + 1),
        )

        st.markdown("### Output Table")

        # Convert the 1-based selection back to a 0-based index.
        st.dataframe(table[int(option) - 1].df)